11import glob
22import os
33import pandas as pd
4- from datetime import datetime
54import argparse
65from datetime import datetime
76
@@ -111,7 +110,8 @@ def update_processed_database(input_path: str):
111110 or line .find ("paper has been processed" ) != - 1
112111 ):
113112 if uuid in df ["uuid" ].values :
114- if df [df ["uuid" ] == uuid ]["status" ] == "success" :
113+ index = df [df ["uuid" ] == uuid ].index [0 ]
114+ if df .loc [index , "status" ] == "success" :
115115 continue
116116 df .loc [df ["uuid" ] == uuid , "status" ] = "success"
117117 df .loc [df ["uuid" ] == uuid , "end_time" ] = current_time
@@ -120,7 +120,8 @@ def update_processed_database(input_path: str):
120120 # failed to process file, update status and error information
121121 if line .find ("message: " ) != - 1 :
122122 if uuid in df ["uuid" ].values :
123- if df [df ["uuid" ] == uuid ]["status" ] == "failure" :
123+ index = df [df ["uuid" ] == uuid ].index [0 ]
124+ if df .loc [index , "status" ] == "failure" :
124125 continue
125126 error_type = line .split ("type: " )[1 ].split (", " )[0 ]
126127 error_info = line .split ("message: " )[1 ].strip ()
@@ -154,6 +155,9 @@ def update_processed_database(input_path: str):
154155 ]
155156 df .loc [index , "overlap" ] = quality_report ["page_quality" ][- 1 ]["ratio" ]
156157
158+ for category_item in quality_report ["category_quality" ]:
159+ df .loc [index , category_item ["category" ]] = category_item ["geometry_count" ]
160+
157161 # remove processing files
158162 df = df [~ (df ["status" ] == "processing" )]
159163 df .to_csv (database_file , index = False )
@@ -169,13 +173,6 @@ def update_discpline_info():
169173 with open (log_file ) as f :
170174 lines = f .readlines ()
171175 for line in lines :
172- if line .find ("[VRDU] Before filtering" ) != - 1 :
173- processable_files = int (line .split ("found " )[1 ].split (" " )[0 ])
174- log .debug (
175- f"discpline: { discpline } , processable files: { processable_files } "
176- )
177- df .loc [df ["discpline" ] == discpline , "num_papers" ] = processable_files
178-
179176 if line .find ("finished processing." ) != - 1 :
180177 df .loc [df ["discpline" ] == discpline , "status" ] = "complete"
181178 else :
@@ -238,7 +235,7 @@ def update_daily_overview() -> None:
238235
239236def main ():
240237 parser = argparse .ArgumentParser ()
241- parser .add_argument ("--input_path" , type = str , default = "data /" )
238+ parser .add_argument ("--input_path" , type = str , default = "output /" )
242239 args = parser .parse_args ()
243240
244241 update_processed_database (args .input_path )
0 commit comments