Coverage for skema/rest/equation_extraction.py: 12%

74 statements  

« prev     ^ index     » next       coverage.py v7.5.0, created at 2024-04-30 17:15 +0000

1import os 

2import requests 

3import json 

4from time import sleep 

5from IPython.display import clear_output 

6 

7 

def process_images_in_folder(folder_path: str, gpt_key: str) -> None:
    """
    Process PNG images in a folder to detect equations using an API.

    Every file in ``folder_path`` whose name ends with ``.png`` is POSTed to
    the equation classifier service. One result record per image is
    collected and the full list is written to ``equation_results.json``
    inside ``folder_path``. A failed request is reported on stdout and
    recorded with ``contains_equation=False``; it does not abort the run.

    Args:
        folder_path (str): Path to the folder containing PNG images.
        gpt_key (str): API key for accessing the equation detection service.

    Returns:
        None

    Raises:
        ValueError: If ``gpt_key`` is empty.
    """
    # URL for equation detection service
    url = "http://54.227.237.7/integration/equation_classifier"

    # Ensure the API key is available
    if not gpt_key:
        raise ValueError("OPENAI_API_KEY environment variable is not set.")

    # List of per-image result records
    results = []

    # Iterate over PNG files in the folder
    for filename in os.listdir(folder_path):
        if not filename.endswith(".png"):
            continue

        image_path = os.path.join(folder_path, filename)

        # The context manager guarantees the handle is closed even when the
        # request raises (connection error, timeout, ...).
        with open(image_path, "rb") as image_file:
            files = {"image": (filename, image_file, "image/png")}
            # Pass the key via ``params`` so it is URL-encoded correctly
            # instead of being spliced into the URL by hand.
            response = requests.post(
                url, params={"gpt_key": gpt_key}, files=files
            )

        if response.status_code == 200:
            data = response.json()
            results.append(
                {
                    "filename": filename,
                    "contains_equation": data["is_equation"],
                    "latex_equation": data["equation_text"],
                }
            )
        else:
            # If the request fails, record a default result and report it
            results.append(
                {
                    "filename": filename,
                    "contains_equation": False,
                    "latex_equation": None,
                }
            )
            print(
                f"Request for {filename} failed with status code:",
                response.status_code,
            )

        # Sleep to avoid overwhelming the API
        sleep(3)

    # Write results to a JSON file
    output_file = f"{folder_path}/equation_results.json"
    with open(output_file, "w") as json_file:
        json.dump(results, json_file, indent=4)

    print("Results written to", output_file)

75 

76 

# Base URL of the COSMOS service; endpoint paths are appended to it.
COSMOS_BASE_URL: str = "http://cosmos0004.chtc.wisc.edu:8088/cosmos_service"

78 

79 

def download_images_from_pdf(pdf_local_path: str, save_folder: str) -> None:
    """
    Extract equation images from a PDF via the COSMOS service and save them.

    The PDF is submitted to the COSMOS processing endpoint, the returned
    status endpoint is polled until the job reports completion or an error,
    and every image listed under the equation extractions is downloaded into
    ``save_folder``.

    Args:
        pdf_local_path (str): Path to the local PDF file to submit.
        save_folder (str): Folder the images are written to; it is created
            if it does not exist.

    Returns:
        None

    Raises:
        RuntimeError: If the job is not done after all polls, if the status
            response carries an error, or if an image cannot be downloaded
            or written.
    """
    # Ensure the save folder exists (exist_ok avoids a check-then-create race)
    os.makedirs(save_folder, exist_ok=True)

    # Submit the locally copied PDF to the COSMOS processing pipeline
    submit_endpoint: str = COSMOS_BASE_URL + "/process/"
    with open(pdf_local_path, "rb") as pdf_to_parse:
        file_form: dict = {"pdf": pdf_to_parse}
        data_form: dict = {"compress_images": False}
        response: requests.Response = requests.post(
            submit_endpoint, files=file_form, data=data_form
        )

    response_data: dict = response.json()
    status_endpoint: str = response_data["status_endpoint"]
    results_endpoint: str = response_data["result_endpoint"]

    POLL_COUNT: int = 80
    POLL_INTERVAL: int = 5

    job_done: bool = False

    # Count attempts from 1 so the progress message reports how many polls
    # have actually been made.
    for attempt in range(1, POLL_COUNT + 1):
        response_data = requests.get(status_endpoint).json()
        clear_output(wait=True)
        print(f"Polled status endpoint {attempt} times:\n{response_data}")
        # An error also ends polling; it is reported after the loop.
        job_done = bool(response_data["error"] or response_data["job_completed"])
        if job_done:
            break
        sleep(POLL_INTERVAL)

    if not job_done:
        raise RuntimeError(
            f"Job not complete after {POLL_COUNT * POLL_INTERVAL} seconds."
        )
    elif response_data["error"]:
        raise RuntimeError(f"An unexpected error occurred: {response_data['error']}")
    else:
        print(
            f"Job succeeded after {response_data['time_processing']} seconds.\n"
            f"Results can be viewed at {results_endpoint}"
        )

    # Extracted document equations, bounding boxes, and images
    equation_data: list = requests.get(
        f"{results_endpoint}/extractions/equations"
    ).json()

    # Download images
    for equation in equation_data:
        img_url: str = equation["img_pth"]
        img_name: str = img_url.split("/")[-1]
        img_save_path: str = os.path.join(save_folder, img_name)
        try:
            img_response: requests.Response = requests.get(img_url)
            # Without this check an HTTP error body would be saved as a
            # ".png" file.
            img_response.raise_for_status()
            with open(img_save_path, "wb") as img_file:
                img_file.write(img_response.content)
        except (requests.RequestException, OSError) as e:
            raise RuntimeError(f"Failed to download image {img_name}: {e}") from e
        print(f"Image downloaded: {img_name}")

144 

145 

def process_pdf_and_images(pdf_local_path: str, save_folder: str, gpt_key: str) -> None:
    """
    Run the full PDF-to-equation pipeline.

    Step one downloads the images extracted from the PDF into
    ``save_folder``; step two runs equation detection over that folder.

    Args:
        pdf_local_path (str): Path to the local PDF file.
        save_folder (str): Path to the folder where images will be saved.
        gpt_key (str): API key for accessing the equation detection service.

    Returns:
        None
    """
    # Step 1: fetch the images for this PDF into the save folder
    download_images_from_pdf(
        pdf_local_path=pdf_local_path,
        save_folder=save_folder,
    )

    # Step 2: detect equations in the images that were just saved
    process_images_in_folder(folder_path=save_folder, gpt_key=gpt_key)