Coverage for skema/rest/integrated_text_reading_proxy.py: 66%

297 statements  

coverage.py v7.5.0, created at 2024-04-30 17:15 +0000

# Client code for SKEMA TR
import io
import itertools as it
import json
import tempfile
import time
from pathlib import Path
from typing import List, Union, BinaryIO, Callable
from typing import Optional, Dict, Any
from zipfile import ZipFile

import pandas as pd
import requests
import httpx
from askem_extractions.data_model import AttributeCollection
from askem_extractions.importers import import_arizona
from fastapi import APIRouter, Depends, FastAPI, UploadFile, Response, status
from langchain.tools.e2b_data_analysis.tool import UploadedFile

from skema.rest.proxies import SKEMA_TR_ADDRESS, MIT_TR_ADDRESS, OPENAI_KEY, COSMOS_ADDRESS
from skema.rest.schema import (
    TextReadingInputDocuments,
    TextReadingAnnotationsOutput,
    TextReadingDocumentResults,
    TextReadingError, MiraGroundingInputs, MiraGroundingOutputItem, TextReadingEvaluationResults,
)
from skema.rest import utils, metal_proxy

router = APIRouter()


# Utility code for the endpoints

def annotate_with_skema(
        endpoint: str,
        input_: Union[str, List[str], List[Dict], List[List[Dict]]]) -> List[Dict[str, Any]]:
    """ Blueprint for calling the SKEMA-TR API """

    if isinstance(input_, (str, dict)):
        # If the text to annotate is a single string representing the contents of a
        # document, make it a list with a single element
        payload = [input_]
    else:
        # If the text to annotate is already a list of documents, it is the payload itself
        payload = input_
    response = requests.post(endpoint, json=payload, timeout=600)
    if response.status_code == 200:
        return response.json()
    else:
        raise RuntimeError(
            f"Calling {endpoint} failed with HTTP code {response.status_code}"
        )


def annotate_text_with_skema(text: Union[str, List[str]]) -> List[Dict[str, Any]]:
    return annotate_with_skema(f"{SKEMA_TR_ADDRESS}/textFileToMentions", text)


def annotate_pdfs_with_skema(
        pdfs: Union[List[List[Dict]], List[Dict]]) -> List[Dict[str, Any]]:
    return annotate_with_skema(f"{SKEMA_TR_ADDRESS}/cosmosJsonToMentions", pdfs)


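# Illustrative usage of the SKEMA client helpers above (a sketch, not executed by the
# service itself; the example text and variable names are made up):
#
#   mentions = annotate_text_with_skema("The susceptible population S(t) decreases over time.")
#   pdf_mentions = annotate_pdfs_with_skema([cosmos_json_blocks])  # cosmos_json_blocks: output of cosmos_client
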

# Client code for MIT TR
def annotate_text_with_mit(
        texts: Union[str, List[str]]
) -> List[Union[Dict[str, Any], str]]:
    endpoint = f"{MIT_TR_ADDRESS}/annotation/upload_file_extract"
    if isinstance(texts, str):
        # If the text to annotate is a single string representing the contents of a
        # document, make it a list with a single element
        texts = [texts]

    # TODO parallelize this
    return_values = list()
    for ix, text in enumerate(texts):
        params = {"gpt_key": OPENAI_KEY}
        files = {"file": io.StringIO(text)}
        response = requests.post(endpoint, params=params, files=files)
        try:
            if response.status_code == 200:
                return_values.append(response.json())
            else:
                return_values.append(
                    f"Calling {endpoint} on the {ix}th input failed with HTTP code {response.status_code}"
                )
        except Exception as ex:
            return_values.append(
                f"Calling {endpoint} on the {ix}th input failed with exception {ex}"
            )
    return return_values


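# Illustrative sketch of the MIT client's per-document results (not executed here):
# each element is either the parsed JSON extraction or an error string, e.g.
#
#   results = annotate_text_with_mit(["first document text", "second document text"])
#   # results[0] -> {... MIT extractions ...}
#   # results[1] -> "Calling ... on the 1th input failed with HTTP code 500"
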

def normalize_extractions(
        arizona_extractions: Optional[Dict[str, Any]], mit_extractions: Optional[Dict]
) -> AttributeCollection:
    collections = list()
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmp_dir = Path(tmpdirname)
        skema_path = tmp_dir / "skema.json"

        canonical_mit, canonical_arizona = None, None

        if arizona_extractions:
            try:
                with skema_path.open("w") as f:
                    json.dump(arizona_extractions, f)
                canonical_arizona = import_arizona(Path(skema_path))
                collections.append(canonical_arizona)
            except Exception as ex:
                print(ex)
        if mit_extractions:
            try:
                # MIT extractions already come normalized
                canonical_mit = AttributeCollection.from_json(mit_extractions)
                collections.append(canonical_mit)
            except Exception as ex:
                print(ex)

        if arizona_extractions and mit_extractions:
            # Merge both, with some de-duplication
            params = {"gpt_key": OPENAI_KEY}

            skema_path = tmp_dir / "canonical_skema.json"
            mit_path = tmp_dir / "canonical_mit.json"

            canonical_arizona.save_json(skema_path)
            canonical_mit.save_json(mit_path)

            data = {
                "mit_file": mit_path.open(),
                "arizona_file": skema_path.open(),
            }
            response = requests.post(
                f"{MIT_TR_ADDRESS}/integration/get_mapping", params=params, files=data
            )

            # MIT merges the collection for us
            if response.status_code == 200:
                merged_collection = AttributeCollection.from_json(response.json())
                # Return the merged collection here
                return merged_collection

    # Merge the collections into an attribute collection
    attributes = list(it.chain.from_iterable(c.attributes for c in collections))

    return AttributeCollection(attributes=attributes)


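# Illustrative sketch of the normalization step (assumes `skema_mentions` and
# `mit_attributes` hold the raw outputs of the two pipelines for one document;
# both names are hypothetical):
#
#   collection = normalize_extractions(
#       arizona_extractions=skema_mentions, mit_extractions=mit_attributes
#   )
#   # collection.attributes holds the merged, canonical extractions
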

def parquet_to_json(path):
    parquet_df = pd.read_parquet(path)
    parquet_json = parquet_df.to_json()
    parquet_data = json.loads(parquet_json)

    if len(parquet_data) > 0:
        parquet_data_keys = list(parquet_data.keys())
        num_data_rows = max(
            [int(k) for k in parquet_data[parquet_data_keys[0]]]
        )

        row_order_parquet_data = [dict() for i in range(num_data_rows + 1)]
        for field_key, row_data in parquet_data.items():
            for row_idx, datum in row_data.items():
                row_idx_num = int(row_idx)
                row_order_parquet_data[row_idx_num][field_key] = datum

        # if filename == "documents.parquet":
        # Sorts the content sections by page number and then by
        # bounding box location. Use x-pos first to account for
        # multi-column documents and then sort by y-pos.
        row_order_parquet_data.sort(
            key=lambda d: (
                d["page_num"],
                d["bounding_box"][0]
                // 500,  # allows for indentation while still catching items across the center line
                # (d["bounding_box"][0]) // 100
                # + round((d["bounding_box"][0] % 100 // 10) / 10),
                d["bounding_box"][1],
            )
        )

        edits = list()
        for e1, extraction1 in enumerate(row_order_parquet_data):
            (ext1_x1, ext1_y1, ext1_x2, ext1_y2) = extraction1[
                "bounding_box"
            ]
            # Don't bother processing for left-justified or centered
            # content ... only right column content needs to be checked
            if ext1_x1 < 500:
                continue

            ext1_page_num = extraction1["page_num"]
            found_col_break = False
            insertion_index = -1
            t1 = e1
            while t1 > 0:
                extraction2 = row_order_parquet_data[t1 - 1]
                ext2_page_num = extraction2["page_num"]
                # If the previous sorted entry is on an earlier page
                # then we can stop our search
                if ext1_page_num > ext2_page_num:
                    break

                (ext2_x1, ext2_y1, ext2_x2, ext2_y2) = extraction2[
                    "bounding_box"
                ]

                if ext1_y2 <= ext2_y1:
                    ext2_xspan = ext2_x2 - ext2_x1
                    # Useful heuristic cutoff for now
                    if ext2_xspan >= 800:
                        found_col_break = True
                        insertion_index = t1 - 1
                t1 -= 1
            if found_col_break:
                edits.append(
                    {
                        "del_idx": e1,
                        "ins_idx": insertion_index,
                        "val": extraction1,
                    }
                )
        for edit_dict in edits:
            del row_order_parquet_data[edit_dict["del_idx"]]
            row_order_parquet_data.insert(
                edit_dict["ins_idx"], edit_dict["val"]
            )
        row_order_parquet_data.sort(key=lambda d: (d["pdf_name"]))

        name2results = dict()
        for row_data in row_order_parquet_data:
            if row_data["pdf_name"] in name2results:
                name2results[row_data["pdf_name"]].append(row_data)
            else:
                name2results[row_data["pdf_name"]] = [row_data]

        return next(iter(name2results.items()))[1]


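# Illustrative sketch of the sort key used above (made-up coordinates): a block at
# x=520 sorts after a block at x=60 on the same page because 520 // 500 == 1 while
# 60 // 500 == 0, i.e. right-column content follows left-column content, and ties
# are broken by the y coordinate:
#
#   blocks = [
#       {"page_num": 1, "bounding_box": [520, 100, 900, 140]},
#       {"page_num": 1, "bounding_box": [60, 300, 480, 340]},
#   ]
#   blocks.sort(key=lambda d: (d["page_num"], d["bounding_box"][0] // 500, d["bounding_box"][1]))
#   # -> the x=60 block now comes first
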

def cosmos_client(name: str, data: BinaryIO):
    """
    Posts a pdf to COSMOS and returns the JSON representation of the parquet file

    """

    # Create POST request to COSMOS server
    # Prep the pdf data for upload
    files = [
        ("pdf", (name, data, 'application/pdf')),
    ]
    response = requests.post(f"{COSMOS_ADDRESS}/process/", files=files)

    if response.status_code == status.HTTP_202_ACCEPTED:

        callback_endpoints = response.json()

        for retry_num in range(200):
            time.sleep(3)  # Retry in three seconds
            poll = requests.get(f"{callback_endpoints['status_endpoint']}")
            if poll.status_code == status.HTTP_200_OK:
                poll_results = poll.json()
                # If the job is completed, fetch the results
                if poll_results['job_completed']:
                    cosmos_response = requests.get(f"{callback_endpoints['result_endpoint']}")
                    if cosmos_response.status_code == status.HTTP_200_OK:
                        data = cosmos_response.content
                        with ZipFile(io.BytesIO(data)) as z:
                            for file in z.namelist():
                                if file.endswith(".parquet") and \
                                        not file.endswith("_figures.parquet") and \
                                        not file.endswith("_pdfs.parquet") and \
                                        not file.endswith("_tables.parquet") and \
                                        not file.endswith("_sections.parquet") and \
                                        not file.endswith("_equations.parquet"):
                                    # convert parquet to json
                                    with z.open(file) as zf:
                                        json_data = parquet_to_json(zf)
                                        return json_data
                        # Shouldn't reach this point
                        raise RuntimeError("COSMOS data doesn't include document file for annotation")
                    else:
                        raise RuntimeError(
                            f"COSMOS Result Error - STATUS CODE: {cosmos_response.status_code} - {COSMOS_ADDRESS}")
                # If the job is not completed yet, just wait until the next iteration
            else:
                pass

        # If we reached this point, we timed out
        raise TimeoutError(f"Timed out waiting for COSMOS on retry num {retry_num + 1}")

    else:
        raise RuntimeError(f"COSMOS Error - STATUS CODE: {response.status_code} - {COSMOS_ADDRESS}")


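# Illustrative usage of the COSMOS client above (a sketch; "paper.pdf" is a placeholder):
#
#   with open("paper.pdf", "rb") as pdf:
#       blocks = cosmos_client("paper.pdf", pdf)
#   # `blocks` is a list of dicts with fields such as "content", "page_num",
#   # "bounding_box" and "pdf_name", as consumed by the endpoints below
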

def merge_pipelines_results(
        skema_extractions,
        mit_extractions,
        general_skema_error,
        general_mit_error,
        annotate_skema,
        annotate_mit):
    """ Merges and de-duplicates text extractions from pipelines """

    # Build the generalized errors list
    generalized_errors = list()
    if general_skema_error:
        generalized_errors.append(
            TextReadingError(
                pipeline="SKEMA",
                message=general_skema_error
            )
        )
    if general_mit_error:
        generalized_errors.append(
            TextReadingError(
                pipeline="MIT",
                message=general_mit_error
            )
        )

    # Build the results and input-specific errors
    results = list()
    errors = list()
    assert len(skema_extractions) == len(
        mit_extractions
    ), "Both pipeline results lists should have the same length"
    for skema, mit in zip(skema_extractions, mit_extractions):
        if annotate_skema and isinstance(skema, str):
            errors.append(TextReadingError(pipeline="SKEMA", message=skema))
            skema = None

        if annotate_mit and isinstance(mit, str):
            errors.append(TextReadingError(pipeline="MIT", message=mit))
            mit = None

        normalized = normalize_extractions(
            arizona_extractions=skema, mit_extractions=mit
        )
        results.append(
            TextReadingDocumentResults(
                data=normalized if normalized.attributes else None,
                errors=errors if errors else None,
            )
        )

    return TextReadingAnnotationsOutput(
        outputs=results,
        generalized_errors=generalized_errors if generalized_errors else None
    )


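# Illustrative sketch of the merge behavior (made-up values): given one document where
# SKEMA succeeded and MIT returned an error string,
#
#   skema_extractions = [skema_result_dict]
#   mit_extractions = ["Calling ... failed with HTTP code 500"]
#
# the string becomes a per-document TextReadingError(pipeline="MIT", ...) and only the
# SKEMA result is passed to normalize_extractions for that document.
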

def integrated_extractions(
        response: Response,
        skema_annotator: Callable,
        skema_inputs: List[Union[str, List[Dict]]],
        mit_inputs: List[str],
        annotate_skema: bool = True,
        annotate_mit: bool = True,
) -> TextReadingAnnotationsOutput:
    """
    Run both text extractors and merge the results.
    This is the annotation logic shared between different input formats
    """

    # Initialize the extractions to a list of empty lists, one per input document
    skema_extractions = [[] for t in skema_inputs]
    mit_extractions = [[] for t in mit_inputs]
    skema_error = None
    mit_error = None

    if annotate_skema:
        try:
            skema_extractions = skema_annotator(skema_inputs)
        except Exception as ex:
            skema_error = f"Problem annotating with SKEMA: {ex}"

    if annotate_mit:
        try:
            mit_extractions = annotate_text_with_mit(mit_inputs)
        except Exception as ex:
            mit_error = f"Problem annotating with MIT: {ex}"

    return_val = merge_pipelines_results(
        skema_extractions,
        mit_extractions,
        skema_error,
        mit_error,
        annotate_skema,
        annotate_mit
    )

    # If there is any error, set the response's status code to 207
    if skema_error or mit_error or any(o.errors is not None for o in return_val.outputs):
        response.status_code = status.HTTP_207_MULTI_STATUS

    return return_val


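# Illustrative sketch of how the endpoints below use the shared logic (not an
# executable snippet on its own; `response` is the FastAPI Response object):
#
#   output = integrated_extractions(response, annotate_text_with_skema, texts, texts)
#   # output.outputs holds one TextReadingDocumentResults per input document;
#   # response.status_code is set to 207 if any pipeline reported an error
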

# End utility code for the endpoints


@router.post(
    "/integrated-text-extractions",
    summary="Posts one or more plain text documents and annotates with SKEMA and/or MIT text reading pipelines",
    status_code=200
)
async def integrated_text_extractions(
        response: Response,
        inputs: TextReadingInputDocuments,
        annotate_skema: bool = True,
        annotate_mit: bool = True,
) -> TextReadingAnnotationsOutput:
    """
    ### Python example
    ```
    params = {
        "annotate_skema": True,
        "annotate_mit": True
    }
    payload = {"texts": [file_text], "amrs": [amr_text]}

    response = requests.post(f"{URL}/text-reading/integrated-text-extractions", params=params, json=payload)
    if response.status_code == 200:
        data = response.json()
    ```
    """
    # Get the input plain texts
    texts = inputs.texts

    amrs = inputs.amrs

    # Run the text extractors
    extractions = integrated_extractions(
        response,
        annotate_text_with_skema,
        texts,
        texts,
        annotate_skema,
        annotate_mit
    )

    # Do the alignment
    aligned_amrs = list()
    if len(amrs) > 0:
        # Build an UploadFile instance from the extractions
        json_extractions = extractions.model_dump_json()
        extractions_ufile = UploadFile(file=io.BytesIO(json_extractions.encode('utf-8')))
        for amr in amrs:
            # amr = json.loads(amr)
            amr_ufile = UploadFile(file=io.BytesIO(amr.encode('utf-8')))
            try:
                aligned_amr = metal_proxy.link_amr(
                    amr_file=amr_ufile,
                    text_extractions_file=extractions_ufile)
                aligned_amrs.append(aligned_amr)
            except Exception as e:
                error = TextReadingError(pipeline="AMR Linker", message=f"Error annotating amr: {e}")
                if extractions.generalized_errors is None:
                    extractions.generalized_errors = [error]
                else:
                    extractions.generalized_errors.append(error)

    extractions.aligned_amrs = aligned_amrs

    return extractions



@router.post(
    "/integrated-pdf-extractions",
    summary="Posts one or more pdf documents and annotates with SKEMA and/or MIT text reading pipelines",
    status_code=200
)
async def integrated_pdf_extractions(
        response: Response,
        pdfs: List[UploadFile],
        amrs: List[UploadFile] = [],
        annotate_skema: bool = True,
        annotate_mit: bool = True
) -> TextReadingAnnotationsOutput:
    """
    ### Python example
    ```
    params = {
        "annotate_skema": True,
        "annotate_mit": True
    }

    files = [("pdfs", ("ijerp.pdf", open("ijerp.pdf", "rb"))), ("amrs", ("amr.json", open("amr.json", "rb")))]

    response = requests.post(f"{URL}/text-reading/integrated-pdf-extractions", params=params, files=files)
    if response.status_code == 200:
        data = response.json()
    ```
    """
    # TODO: Make this handle multiple pdf files in parallel
    # Call COSMOS on the pdfs
    cosmos_data = list()
    for pdf in pdfs:
        if pdf.filename.endswith("json"):
            json_data = json.load(pdf.file)
        else:
            json_data = cosmos_client(pdf.filename, pdf.file)
        cosmos_data.append(json_data)

    # Get the plain text version from cosmos, passed through to the MIT pipeline
    plain_texts = ['\n'.join(block['content'] for block in c) for c in cosmos_data]

    # Run the text extractors
    extractions = integrated_extractions(
        response,
        annotate_pdfs_with_skema,
        cosmos_data,
        plain_texts,
        annotate_skema,
        annotate_mit
    )

    # Do the alignment
    aligned_amrs = list()
    if len(amrs) > 0:
        # Build an UploadFile instance from the extractions
        json_extractions = extractions.model_dump_json()
        extractions_ufile = UploadFile(file=io.BytesIO(json_extractions.encode('utf-8')))
        for amr in amrs:
            try:
                aligned_amr = metal_proxy.link_amr(
                    amr_file=amr,
                    text_extractions_file=extractions_ufile)
                aligned_amrs.append(aligned_amr)
            except Exception as e:
                error = TextReadingError(pipeline="AMR Linker", message=f"Error annotating {amr.filename}: {e}")
                if extractions.generalized_errors is None:
                    extractions.generalized_errors = [error]
                else:
                    extractions.generalized_errors.append(error)

    extractions.aligned_amrs = aligned_amrs

    return extractions



# These are some direct proxies to the SKEMA and MIT APIs
@router.post(
    "/cosmos_to_json",
    status_code=200,
)
async def cosmos_to_json(pdf: UploadFile) -> List[Dict]:
    """ Calls COSMOS on a pdf and converts the data into json

    ### Python example
    ```
    response = requests.post(f"{endpoint}/text-reading/cosmos_to_json",
                             files=[
                                 ("pdf", ("ijerp.pdf", open("ijerph-18-09027.pdf", 'rb')))
                             ]
                             )
    ```
    """
    return cosmos_client(pdf.filename, pdf.file)



@router.post(
    "/ground_to_mira",
    status_code=200,
    response_model=List[List[MiraGroundingOutputItem]]
)
async def ground_to_mira(k: int, queries: MiraGroundingInputs, response: Response) -> List[List[MiraGroundingOutputItem]]:
    """ Proxy to the MIRA grounding functionality on the SKEMA TR service

    ### Python example
    ```
    queries = {"queries": ["infected", "susceptible"]}
    params = {"k": 5}
    response = requests.post(f"{endpoint}/text-reading/ground_to_mira", params=params, json=queries)

    if response.status_code == 200:
        results = response.json()
    ```
    """
    params = {
        "k": k
    }
    headers = {
        "Content-Type": "text/plain"
    }
    payload = "\n".join(queries.queries)
    inner_response = requests.post(f"{SKEMA_TR_ADDRESS}/groundStringsToMira", headers=headers, params=params,
                                   data=payload)

    response.status_code = inner_response.status_code

    if inner_response.status_code == 200:
        return [[MiraGroundingOutputItem(**o) for o in q] for q in inner_response.json()]
    else:
        return inner_response.content



@router.post("/cards/get_model_card")
async def get_model_card(text_file: UploadFile, code_file: UploadFile, response: Response):
    """ Calls the model card endpoint from MIT's pipeline

    ### Python example
    ```
    files = {
        "text_file": ("text_file.txt", open("text_file.txt", 'rb')),
        "code_file": ("code_file.py", open("code_file.py", 'rb')),
    }

    response = requests.post(f"{endpoint}/text-reading/cards/get_model_card", files=files)
    ```
    """

    params = {
        "gpt_key": OPENAI_KEY,
    }
    files = {
        "text_file": (text_file.filename, text_file.file, "text/plain"),
        "code_file": (code_file.filename, code_file.file, "text/plain")
    }

    inner_response = requests.post(f"{MIT_TR_ADDRESS}/cards/get_model_card", params=params, files=files)

    response.status_code = inner_response.status_code
    return inner_response.json()



@router.post("/cards/get_data_card")
async def get_data_card(smart: bool, csv_file: UploadFile, doc_file: UploadFile, response: Response):
    """
    Calls the data card endpoint from MIT's pipeline.
    The smart option provides better results but slower response times due to extra GPT calls.

    ### Python example
    ```
    params = {
        "smart": False
    }

    files = {
        "csv_file": ("csv_file.csv", open("csv_file.csv", 'rb')),
        "doc_file": ("doc_file.txt", open("doc_file.txt", 'rb')),
    }

    response = requests.post(f"{endpoint}/text-reading/cards/get_data_card", params=params, files=files)
    ```
    """

    params = {
        "gpt_key": OPENAI_KEY,
        "smart": smart
    }
    files = {
        "csv_file": (csv_file.filename, csv_file.file, "text/csv"),
        "doc_file": (doc_file.filename, doc_file.file, "text/plain")
    }

    inner_response = requests.post(f"{MIT_TR_ADDRESS}/cards/get_data_card", params=params, files=files)

    response.status_code = inner_response.status_code
    return inner_response.json()



####


@router.get(
    "/healthcheck",
    summary="Check health of integrated text reading service",
    response_model=int,
    status_code=200,
    responses={
        200: {
            "model": int,
            "description": "All component services are healthy (200 status)",
        },
        500: {
            "model": int,
            "description": "Internal error occurred",
            "example_value": 500
        },
        502: {
            "model": int,
            "description": "At least one of the text reading services is not available"
        }
    },
)
def healthcheck() -> int:
    # SKEMA health check
    skema_endpoint = f"{SKEMA_TR_ADDRESS}/api/skema"
    try:
        skema_response = requests.get(skema_endpoint, timeout=10)
    except Exception:
        return status.HTTP_500_INTERNAL_SERVER_ERROR

    # TODO replace this with a proper healthcheck endpoint
    mit_endpoint = f"{MIT_TR_ADDRESS}/annotation/find_text_vars/"
    mit_params = {"gpt_key": OPENAI_KEY}
    files = {"file": io.StringIO("x = 0")}
    try:
        mit_response = requests.post(mit_endpoint, params=mit_params, files=files, timeout=10)
    except Exception:
        return status.HTTP_502_BAD_GATEWAY
    ######################################################

    status_code = (
        status.HTTP_200_OK
        if all(resp.status_code == 200 for resp in [skema_response, mit_response])
        else status.HTTP_500_INTERNAL_SERVER_ERROR
    )
    return status_code



@router.post("/eval", response_model=TextReadingEvaluationResults, status_code=200)
def quantitative_eval(extractions_file: UploadFile,
                      gt_annotations: UploadFile, json_text: UploadFile) -> TextReadingEvaluationResults:
    """
    # Gets performance metrics of a set of text extractions against a ground truth annotations file.

    ## Example:
    ```python
    files = {
        "extractions_file": ("paper_variable_extractions.json", open("paper_variable_extractions.json", 'rb')),
        "gt_annotations": ("paper_gt_annotations.json", open("paper_gt_annotations.json", 'rb')),
        "json_text": ("paper_cosmos_output.json", open("paper_cosmos_output.json", 'rb')),
    }

    response = requests.post(f"{endpoint}/text-reading/eval", files=files)
    ```
    """

    gt_data = json.load(gt_annotations.file)
    json_contents = json.load(json_text.file)

    # Support both bare serialized AttributeCollections and collections wrapped in this REST API's output envelope
    extractions_json = json.load(extractions_file.file)
    try:
        extractions = AttributeCollection.from_json(extractions_json)
    except KeyError:
        extractions_file.file.seek(0)
        service_output = json.load(extractions_file.file)
        collections = list()
        for collection in service_output['outputs']:
            collection = AttributeCollection.from_json(collection['data'])
            collections.append(collection)

        extractions = AttributeCollection(
            attributes=list(it.chain.from_iterable(c.attributes for c in collections)))

    return utils.compute_text_reading_evaluation(gt_data, extractions, json_contents)


app = FastAPI()
app.include_router(router)