@@ -31,22 +31,41 @@ class DocumentHelper:
3131 def __init__ (self , ax_client : Axiomatic ):
3232 self ._ax_client = ax_client
3333
34- def pdf_from_url (self , url : str ) -> ParseResponse :
35- """Download a PDF document from a URL and parse it into a Markdown response."""
36- if "arxiv" in url and "abs" in url :
37- url = url .replace ("abs" , "pdf" )
38- print ("The URL is an arXiv abstract page. Replacing 'abs' with 'pdf' to download the PDF." )
39- file = requests .get (url )
40- response = self ._ax_client .document .parse (file = file .content )
41- return response
42-
43- def pdf_from_file (self , path : str ) -> ParseResponse :
34+ def pdf_from_file (self , path : str ):
4435 """Open a PDF document from a file path and parse it into a Markdown response."""
4536 with open (path , "rb" ) as f :
46- file = f .read ()
47- response = self ._ax_client .document .parse (file = file )
37+ file_bytes = f .read ()
38+
39+ # Create a (filename, content, content-type) tuple;
40+ # we do this because .parse expects a FastAPI UploadFile
41+ file_name = path .split ("/" )[- 1 ]
42+ file_tuple = (file_name , file_bytes , "application/pdf" )
43+
44+ response = self ._ax_client .document .parse (file = file_tuple )
4845 return response
4946
47+ def pdf_from_url (self , url : str ):
48+ """Download a PDF document from a URL and parse it into a Markdown response."""
49+ if "arxiv.org" in url and "abs" in url :
50+ url = url .replace ("abs" , "pdf" )
51+ print ("The URL is an arXiv abstract page. Replacing 'abs' with 'pdf' to download the PDF." )
52+ response = requests .get (url )
53+
54+ if response .status_code != 200 :
55+ raise Exception (f"Failed to download PDF. Status code: { response .status_code } " )
56+
57+ # Extract filename from URL or use a default
58+ file_name = url .split ("/" )[- 1 ]
59+ if not file_name .endswith (".pdf" ):
60+ file_name = "document.pdf"
61+
62+ # Create a (filename, content, content-type) tuple;
63+ # we do this because .parse expects a FastAPI UploadFile
64+ file_tuple = (file_name , response .content , "application/pdf" )
65+
66+ parse_response = self ._ax_client .document .parse (file = file_tuple )
67+ return parse_response
68+
5069 def plot_b64_images (self , images : Dict [str , str ]):
5170 """Plot a dictionary of base64 images."""
5271 import ipywidgets as widgets # type: ignore
0 commit comments