11from __future__ import annotations
22
33import ast
4+ import contextlib
45import csv
56import hashlib
67import json
3738
3839from paperqa .utils import (
3940 bytes_to_string ,
41+ compute_unique_doc_id ,
4042 create_bibtex_key ,
4143 encode_id ,
4244 format_bibtex ,
4345 get_citation_ids ,
4446 maybe_get_date ,
47+ md5sum ,
4548 string_to_bytes ,
4649)
4750from paperqa .version import __version__ as pqa_version
6164 "docname" ,
6265 "dockey" ,
6366 "citation" ,
67+ "content_hash" , # Metadata providers won't give this
6468}
69+ # Sentinel field default requesting that the value be autopopulated within model_validator
70+ AUTOPOPULATE_VALUE = "" # NOTE: this is falsy by design
6571
6672
6773class Doc (Embeddable ):
@@ -70,6 +76,13 @@ class Doc(Embeddable):
7076 docname : str
7177 dockey : DocKey
7278 citation : str
79+ content_hash : str | None = Field (
80+ default = AUTOPOPULATE_VALUE ,
81+ description = (
82+ "Optional hash of the document's contents (to reiterate, not a file path to"
83+ " the document, but the document's contents itself)."
84+ ),
85+ )
7386 # Sort the serialization to minimize the diff of serialized objects
7487 fields_to_overwrite_from_metadata : Annotated [set [str ], PlainSerializer (sorted )] = (
7588 Field (
@@ -171,10 +184,6 @@ def __hash__(self) -> int:
171184 return hash ((self .name , self .text ))
172185
173186
174- # Sentinel to autopopulate a field within model_validator
175- AUTOPOPULATE_VALUE = "" # NOTE: this is falsy by design
176-
177-
178187class Context (BaseModel ):
179188 """A class to hold the context of a question."""
180189
@@ -660,8 +669,8 @@ class DocDetails(Doc):
660669 doc_id : str | None = Field (
661670 default = None ,
662671 description = (
663- "Unique ID for this document. Simple ways to acquire one include "
664- " hashing the DOI or a stringifying a UUID ."
672+ "Unique ID for this document. A simple and robust way to acquire one is "
673+ " hashing the paper content's hash concatenate with the lowercased DOI ."
665674 ),
666675 )
667676 file_location : str | os .PathLike | None = Field (
@@ -720,9 +729,9 @@ def lowercase_doi_and_populate_doc_id(cls, data: dict[str, Any]) -> dict[str, An
720729 doi = doi .replace (url_prefix_to_remove , "" )
721730 data ["doi" ] = doi .lower ()
722731 if not data .get ("doc_id" ): # keep user defined doc_ids
723- data ["doc_id" ] = encode_id (doi . lower ( ))
732+ data ["doc_id" ] = compute_unique_doc_id (doi , data . get ( "content_hash" ))
724733 elif not data .get ("doc_id" ): # keep user defined doc_ids
725- data ["doc_id" ] = encode_id ( uuid4 ( ))
734+ data ["doc_id" ] = compute_unique_doc_id ( doi , data . get ( "content_hash" ))
726735
727736 if "dockey" in data .get (
728737 "fields_to_overwrite_from_metadata" ,
@@ -933,6 +942,17 @@ def populate_bibtex_key_citation(cls, data: dict[str, Any]) -> dict[str, Any]:
933942 data ["citation" ] = data .get ("title" ) or CITATION_FALLBACK_DATA ["title" ]
934943 return data
935944
@classmethod
def populate_content_hash(cls, data: dict[str, Any]) -> dict[str, Any]:
    """Fill in ``content_hash`` when it is missing or left at the sentinel.

    Any value other than the autopopulate sentinel (including an explicit
    ``None``) is kept as given. Otherwise the hash is reset to ``None`` and,
    if ``file_location`` is truthy, replaced by ``md5sum`` of that location;
    a ``FileNotFoundError`` from hashing leaves it as ``None``.

    Args:
        data: Raw field values being validated; mutated in place.

    Returns:
        The same ``data`` mapping that was passed in.
    """
    # A missing key falls back to the sentinel, so "absent" and "sentinel" are
    # treated alike, while an explicit `None` compares unequal and is preserved
    wants_autopopulate = (
        data.get("content_hash", AUTOPOPULATE_VALUE) == AUTOPOPULATE_VALUE
    )
    if not wants_autopopulate:
        return data

    data["content_hash"] = None  # Assume we don't have it
    location = data.get("file_location")
    if location:  # Try to update it
        with contextlib.suppress(FileNotFoundError):
            data["content_hash"] = md5sum(location)
    return data
955+
936956 @model_validator (mode = "before" )
937957 @classmethod
938958 def validate_all_fields (cls , data : Mapping [str , Any ]) -> dict [str , Any ]:
@@ -952,6 +972,7 @@ def validate_all_fields(cls, data: Mapping[str, Any]) -> dict[str, Any]:
952972 data [possibly_str_field ], str
953973 ):
954974 data [possibly_str_field ] = ast .literal_eval (data [possibly_str_field ])
975+ data = cls .populate_content_hash (data )
955976 data = cls .lowercase_doi_and_populate_doc_id (data )
956977 data = cls .remove_invalid_authors (data )
957978 data = cls .misc_string_cleaning (data )
@@ -1112,6 +1133,14 @@ def __add__(self, other: DocDetails | int) -> DocDetails: # noqa: PLR0912
11121133 )
11131134 else :
11141135 merged_data [field ] = max (self_value , other_value )
1136+ elif field == "content_hash" and ( # noqa: PLR0916
1137+ # Hashes are both present but differ
1138+ (self_value and other_value and self_value != other_value )
1139+ # One hash is explicitly disabled (not autopopulated)
1140+ or (self_value is None or other_value is None )
1141+ ):
1142+ # We don't know which to pick, so just discard the value
1143+ merged_data [field ] = None
11151144
11161145 else :
11171146 # Prefer non-null values, default preference for 'other' object.
@@ -1126,10 +1155,13 @@ def __add__(self, other: DocDetails | int) -> DocDetails: # noqa: PLR0912
11261155 else self_value
11271156 )
11281157
1129- # Recalculate doc_id if doi has changed
1130- if merged_data ["doi" ] != self .doi :
1131- merged_data ["doc_id" ] = (
1132- encode_id (merged_data ["doi" ].lower ()) if merged_data ["doi" ] else None
1158+ if (
1159+ merged_data ["doi" ] != self .doi
1160+ or merged_data ["content_hash" ] != self .content_hash
1161+ ):
1162+ # Recalculate doc_id if doi or content hash has changed
1163+ merged_data ["doc_id" ] = compute_unique_doc_id (
1164+ merged_data ["doi" ], merged_data .get ("content_hash" )
11331165 )
11341166
11351167 # Create and return new DocDetails instance
0 commit comments