Coverage for skema/rest/equation_extraction.py

1import os

2import requests

3import json

4from time import sleep

5from IPython.display import clear_output

def process_images_in_folder(folder_path: str, gpt_key: str) -> None:
    """
    Process PNG images in a folder to detect equations using an API.

    Every file in ``folder_path`` whose name ends in ".png" is posted to the
    equation classifier service. One record per image is collected and the
    whole list is written to ``equation_results.json`` inside ``folder_path``.
    A non-200 response is reported on stdout and recorded as
    "no equation" for that image; it does not abort the run.

    Args:
        folder_path (str): Path to the folder containing PNG images.
        gpt_key (str): API key for accessing the equation detection service.

    Returns:
        None

    Raises:
        ValueError: If ``gpt_key`` is empty.
    """
    # URL for equation detection service
    url = "http://54.227.237.7/integration/equation_classifier"

    # Ensure the API key is available
    if not gpt_key:
        raise ValueError("OPENAI_API_KEY environment variable is not set.")

    # The key travels as a query parameter; letting requests build the query
    # string guarantees it is URL-encoded.
    query = {"gpt_key": gpt_key}

    # List of per-image result records
    results = []

    # Sorted so the output file does not depend on directory listing order
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".png"):
            continue

        image_path = os.path.join(folder_path, filename)

        # The context manager closes the file even if the request raises
        with open(image_path, "rb") as image_file:
            files = {"image": (filename, image_file, "image/png")}
            # Send POST request to the equation detection service
            response = requests.post(url, params=query, files=files)

        if response.status_code == 200:
            data = response.json()
            results.append(
                {
                    "filename": filename,
                    "contains_equation": data["is_equation"],
                    "latex_equation": data["equation_text"],
                }
            )
        else:
            # If the request fails, record a default result and report it
            results.append(
                {
                    "filename": filename,
                    "contains_equation": False,
                    "latex_equation": None,
                }
            )
            print(
                f"Request for {filename} failed with status code:",
                response.status_code,
            )

        # Sleep to avoid overwhelming the API
        sleep(3)

    # Write results to a JSON file
    output_file = os.path.join(folder_path, "equation_results.json")
    with open(output_file, "w") as json_file:
        json.dump(results, json_file, indent=4)

    print("Results written to", output_file)

# Base URL of the COSMOS service; endpoint paths such as "/process/" are
# appended to it.
COSMOS_BASE_URL: str = "http://cosmos0004.chtc.wisc.edu:8088/cosmos_service"

def download_images_from_pdf(pdf_local_path: str, save_folder: str) -> None:
    """
    Submit a PDF to the COSMOS service and download its equation images.

    The PDF is posted to the COSMOS "/process/" endpoint, the returned status
    endpoint is polled until the job reports completion or an error, and each
    image referenced by the equation extractions is then saved into
    ``save_folder`` under the last path segment of its URL.

    Args:
        pdf_local_path (str): Path to the local PDF file.
        save_folder (str): Folder the images are written to; created if
            it does not exist.

    Returns:
        None

    Raises:
        RuntimeError: If the job does not finish within the polling budget,
            if the service reports an error, or if an image cannot be
            downloaded or written.
        requests.HTTPError: If the submission is rejected by the service.
    """
    # Ensure the save folder exists (no check-then-create race)
    os.makedirs(save_folder, exist_ok=True)

    # Submit the locally copied PDF to the COSMOS processing pipeline
    submit_endpoint: str = COSMOS_BASE_URL + "/process/"
    with open(pdf_local_path, "rb") as pdf_to_parse:
        file_form: dict = {"pdf": pdf_to_parse}
        data_form: dict = {"compress_images": False}
        response: requests.Response = requests.post(
            submit_endpoint, files=file_form, data=data_form
        )

    # Fail with the HTTP error rather than a confusing JSON/KeyError below
    response.raise_for_status()
    response_data: dict = response.json()

    status_endpoint: str = response_data["status_endpoint"]
    results_endpoint: str = response_data["result_endpoint"]

    POLL_COUNT: int = 80
    POLL_INTERVAL: int = 5

    job_done: bool = False

    # Count polls from 1 so the progress message reports polls performed
    for poll_number in range(1, POLL_COUNT + 1):
        response_data = requests.get(status_endpoint).json()
        clear_output(wait=True)
        print(f"Polled status endpoint {poll_number} times:\n{response_data}")
        job_done = bool(response_data["error"] or response_data["job_completed"])
        if job_done:
            break
        sleep(POLL_INTERVAL)

    if not job_done:
        raise RuntimeError(
            f"Job not complete after {POLL_COUNT * POLL_INTERVAL} seconds."
        )
    elif response_data["error"]:
        raise RuntimeError(f"An unexpected error occurred: {response_data['error']}")
    else:
        print(
            f"Job succeeded after {response_data['time_processing']} seconds.\n"
            f"Results can be viewed at {results_endpoint}"
        )

    # Extracted document equations, bounding boxes, and images
    equation_data: list = requests.get(
        f"{results_endpoint}/extractions/equations"
    ).json()

    # Download images
    for equation in equation_data:
        img_url: str = equation["img_pth"]
        img_name: str = img_url.split("/")[-1]
        img_save_path: str = os.path.join(save_folder, img_name)
        try:
            img_response: requests.Response = requests.get(img_url)
            # Without this an HTTP error page would be saved as an image
            img_response.raise_for_status()
            with open(img_save_path, "wb") as img_file:
                img_file.write(img_response.content)
        except (requests.RequestException, OSError) as e:
            raise RuntimeError(f"Failed to download image {img_name}: {e}") from e
        print(f"Image downloaded: {img_name}")

144

145

def process_pdf_and_images(pdf_local_path: str, save_folder: str, gpt_key: str) -> None:
    """
    Run the two-stage pipeline: fetch a PDF's images, then classify them.

    Args:
        pdf_local_path (str): Path to the local PDF file.
        save_folder (str): Folder that receives the images and is then
            scanned for equations.
        gpt_key (str): API key for accessing the equation detection service.

    Returns:
        None
    """
    # Stage 1: populate save_folder with images taken from the PDF
    download_images_from_pdf(pdf_local_path=pdf_local_path, save_folder=save_folder)

    # Stage 2: run equation detection over everything just saved
    process_images_in_folder(folder_path=save_folder, gpt_key=gpt_key)

Coverage for skema/rest/equation_extraction.py: 12%

74 statements