@@ -306,8 +306,27 @@ def process_sentence_file(dataset_fold, dataset_file, json_file, target_file):
306306 for l in file_lines :
307307 new_file .write (l )
308308
def process_token_file(dataset_fold, dataset_file, sentence_json_file, token_edits_json_file, target_file):
    """
    Applies per-line token corrections to one fold of the corpus.

    :param dataset_fold: Name of the fold being corrected (e.g. "train"/"dev"/"test")
    :param dataset_file: Path to the corpus file that has already had sentence
        boundary corrections applied
    :param sentence_json_file: JSON file mapping fold name -> list of line indices
        (in the ORIGINAL file) removed by the sentence-boundary correction step
    :param token_edits_json_file: JSON table (readable by pandas) indexed by
        original line number, with at least the columns ``fold`` and ``correct_line``
    :param target_file: Path the corrected file is written to
    """
    with open(token_edits_json_file) as f:
        edits = pd.read_json(f)
    # Keep only the edits belonging to this fold.
    edits = edits[edits.fold == dataset_fold]

    with open(sentence_json_file) as f:
        sentence_deletes = json.load(f)

    with open(dataset_file, "r") as source_file:
        file_lines = source_file.readlines()

    # Guard: with no edits for this fold, edits.index.max() is NaN and the
    # range() below would raise — just copy the file through unchanged.
    if not edits.empty:
        # Edit indices refer to the ORIGINAL file; lines deleted by the
        # sentence-correction step shift later lines up, so track the offset.
        removed = 0
        # Range end is max() + 1 so the edit with the highest line number is
        # applied as well (the previous `range(0, max())` silently skipped it).
        for line_no in range(0, edits.index.max() + 1):
            if line_no in sentence_deletes[dataset_fold]:
                removed += 1
            if line_no in edits.index:
                file_lines[line_no - removed] = edits.at[line_no, 'correct_line']

    with open(target_file, "w+") as new_file:
        for line in file_lines:
            new_file.write(line)
327+
309328
310- def apply_corrections (data_set_info , label_csv_file , sentence_json_file , target_dir = None , corpus_fold = None ):
329+ def apply_corrections (data_set_info , label_csv_file , sentence_json_file ,token_edits_json_file , target_dir = None , corpus_fold = None ):
311330 """
312331 Applies label and sentence boundary corrections
313332 :param data_set_info: Dictionary containing a mapping from fold name to file name for
@@ -333,7 +352,10 @@ def apply_corrections(data_set_info, label_csv_file, sentence_json_file, target_
333352 process_label_file (fold , fold_file , label_csv_file , None , temp_file .name )
334353
335354 logging .info ("Correcting sentence boundaries for fold '{}'" .format (fold ))
336- process_sentence_file (fold , temp_file .name , sentence_json_file , target_file )
355+ process_sentence_file (fold , temp_file .name , sentence_json_file , temp_file .name )
356+
357+ logging .info ("Correcting token errors for fold'{}'" .format (fold ))
358+ process_token_file (fold ,temp_file .name ,sentence_json_file , token_edits_json_file ,target_file )
337359
338360 logging .info ("Corrected corpus fold '{}' to file: '{}'" .format (fold , target_file ))
339361
@@ -358,6 +380,10 @@ def apply_corrections(data_set_info, label_csv_file, sentence_json_file, target_
358380 default = os .path .join ("corrected_labels" ,
359381 "sentence_corrections.json" ))
360382
383+ parser .add_argument ("--token_corrections_file" , type = str ,
384+ default = os .path .join ("corrected_labels" ,
385+ "token_corrections.json" ))
386+
361387 parser .add_argument ("--corpus_fold" , type = str ,
362388 help = "Correct only a specific fold of the corpus if specified as "
363389 "[train|dev|test], otherwise with correct the entire corpus" )
@@ -375,6 +401,7 @@ def apply_corrections(data_set_info, label_csv_file, sentence_json_file, target_
375401 "original_corpus_dir" : args .original_corpus_dir ,
376402 "corrected_corpus_dir" : args .original_corpus_dir ,
377403 "label_corrections_file" : args .label_corrections_file ,
404+ "token_corrections_file" : args .token_corrections_file ,
378405 "sentence_boundary_corrections_file" : args .sentence_boundary_corrections_file ,
379406 }
380407
@@ -387,6 +414,7 @@ def apply_corrections(data_set_info, label_csv_file, sentence_json_file, target_
387414 get_or_download_corpus (target_dir = args .original_corpus_dir ),
388415 args .label_corrections_file ,
389416 args .sentence_boundary_corrections_file ,
417+ args .token_corrections_file ,
390418 args .corrected_corpus_dir ,
391419 args .corpus_fold
392420 )
0 commit comments