Coverage for skema/rest/equation_extraction.py: 12%
74 statements
« prev ^ index » next coverage.py v7.5.0, created at 2024-04-30 17:15 +0000
« prev ^ index » next coverage.py v7.5.0, created at 2024-04-30 17:15 +0000
1import os
2import requests
3import json
4from time import sleep
5from IPython.display import clear_output
def process_images_in_folder(folder_path: str, gpt_key: str) -> None:
    """
    Process PNG images in a folder to detect equations using an API.

    Every file in ``folder_path`` whose name ends with ``.png`` is uploaded
    to the equation classifier service. One result record per image is
    collected and the full list is written to ``equation_results.json``
    inside ``folder_path``.

    Args:
        folder_path (str): Path to the folder containing PNG images.
        gpt_key (str): API key for accessing the equation detection service.

    Returns:
        None

    Raises:
        ValueError: If ``gpt_key`` is empty.
    """
    # URL for equation detection service
    url = "http://54.227.237.7/integration/equation_classifier"

    # Ensure the API key is available
    if not gpt_key:
        raise ValueError("OPENAI_API_KEY environment variable is not set.")

    # Pass the key as a query parameter so it is URL-encoded correctly
    query = {"gpt_key": gpt_key}

    # List holding one result record per processed image
    results = []

    # Iterate over PNG files in the folder; sorted so the output order is
    # reproducible (os.listdir returns entries in arbitrary order)
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".png"):
            continue

        image_path = os.path.join(folder_path, filename)

        # The context manager closes the file even if the request raises
        with open(image_path, "rb") as image_file:
            files = {"image": (filename, image_file, "image/png")}
            response = requests.post(url, params=query, files=files)

        if response.status_code == 200:
            data = response.json()
            results.append(
                {
                    "filename": filename,
                    "contains_equation": data["is_equation"],
                    "latex_equation": data["equation_text"],
                }
            )
        else:
            # If the request fails, record a default result and report it;
            # processing continues with the remaining images
            results.append(
                {
                    "filename": filename,
                    "contains_equation": False,
                    "latex_equation": None,
                }
            )
            print(
                f"Request for {filename} failed with status code:",
                response.status_code,
            )

        # Sleep to avoid overwhelming the API
        sleep(3)

    # Write results to a JSON file
    output_file = os.path.join(folder_path, "equation_results.json")
    with open(output_file, "w") as json_file:
        json.dump(results, json_file, indent=4)

    print("Results written to", output_file)
# Base URL of the COSMOS service; endpoint paths such as "/process/" are
# appended to it when submitting a PDF.
COSMOS_BASE_URL: str = "http://cosmos0004.chtc.wisc.edu:8088/cosmos_service"
def download_images_from_pdf(pdf_local_path: str, save_folder: str) -> None:
    """
    Extract equation images from a PDF via the COSMOS service and save them.

    The PDF is submitted to the COSMOS processing pipeline, the job's status
    endpoint is polled until the job reports completion or an error (or the
    polling budget is exhausted), and every extracted equation image is then
    downloaded into ``save_folder``.

    Args:
        pdf_local_path (str): Path to the local PDF file to submit.
        save_folder (str): Folder where images are written; created if it
            does not exist.

    Returns:
        None

    Raises:
        RuntimeError: If the job is not complete within the polling budget,
            if the service reports an error, or if an image cannot be
            downloaded or saved.
        requests.HTTPError: If the submission request returns an HTTP error
            status.
    """
    # Ensure the save folder exists (exist_ok avoids a check-then-create race)
    os.makedirs(save_folder, exist_ok=True)

    # Submit the locally copied PDF to the COSMOS processing pipeline
    submit_endpoint: str = COSMOS_BASE_URL + "/process/"
    with open(pdf_local_path, "rb") as pdf_to_parse:
        file_form: dict = {"pdf": pdf_to_parse}
        data_form: dict = {"compress_images": False}
        response: requests.Response = requests.post(
            submit_endpoint, files=file_form, data=data_form
        )

    # Surface a rejected submission as an HTTP error instead of an opaque
    # JSON-decoding or KeyError failure further down
    response.raise_for_status()

    response_data: dict = response.json()
    status_endpoint: str = response_data["status_endpoint"]
    results_endpoint: str = response_data["result_endpoint"]

    POLL_COUNT: int = 80
    POLL_INTERVAL: int = 5

    job_done: bool = False

    for i in range(POLL_COUNT):
        response_data = requests.get(status_endpoint).json()
        clear_output(wait=True)
        # i is zero-based, so i + 1 polls have been made at this point
        print(f"Polled status endpoint {i + 1} times:\n{response_data}")
        # A reported error also ends polling; it is raised below
        job_done = bool(response_data["error"] or response_data["job_completed"])
        if job_done:
            break
        sleep(POLL_INTERVAL)

    if not job_done:
        raise RuntimeError(
            f"Job not complete after {POLL_COUNT * POLL_INTERVAL} seconds."
        )
    elif response_data["error"]:
        raise RuntimeError(f"An unexpected error occurred: {response_data['error']}")
    else:
        print(
            f"Job succeeded after {response_data['time_processing']} seconds.\n"
            f"Results can be viewed at {results_endpoint}"
        )

    # Extracted document equations, bounding boxes, and images
    # (iterated below as a sequence of per-equation records)
    equation_data: list = requests.get(
        f"{results_endpoint}/extractions/equations"
    ).json()

    # Download images
    for equation in equation_data:
        img_url: str = equation["img_pth"]
        img_name: str = img_url.split("/")[-1]
        img_save_path: str = os.path.join(save_folder, img_name)
        try:
            img_response: requests.Response = requests.get(img_url)
            # Do not save an HTTP error body to disk as if it were an image
            img_response.raise_for_status()
            with open(img_save_path, "wb") as img_file:
                img_file.write(img_response.content)
            print(f"Image downloaded: {img_name}")
        except Exception as e:
            # Chain the cause so the original traceback is preserved
            raise RuntimeError(f"Failed to download image {img_name}: {e}") from e
def process_pdf_and_images(pdf_local_path: str, save_folder: str, gpt_key: str) -> None:
    """
    Run the full pipeline: extract images from a PDF, then classify them.

    Args:
        pdf_local_path (str): Path to the local PDF file.
        save_folder (str): Path to the folder where images will be saved.
        gpt_key (str): API key for accessing the equation detection service.

    Returns:
        None
    """
    # Step 1: pull the images out of the PDF into save_folder
    download_images_from_pdf(pdf_local_path=pdf_local_path, save_folder=save_folder)

    # Step 2: run equation detection over the images saved in step 1
    process_images_in_folder(folder_path=save_folder, gpt_key=gpt_key)