Patching all_conll_corrections_combined to remove errors and warnings (#26)

BryanCutler · web-flow · commit f3684a05ab18 · 2020-11-16T10:42:48.000-08:00
* Fixed all combined corrections to get rid of I-O warnings
* Reapplied corrections for OCASEK and T&N errors from #3
* Fixed order of fields in 'Mariner Darren Bragg' correction
* Fixed some formatting and output in script
diff --git a/corrected_labels/all_conll_corrections_combined.csv b/corrected_labels/all_conll_corrections_combined.csv
@@ -69,10 +69,8 @@
 204,dev,39,"[11, 23): 'Boxing-Bruno'",MISC,Token,"[18, 23): 'Bruno'",PER,17,,False,True,True,False
 205,dev,39,,MISC,Token,"[18, 23): 'Bruno'	",PER,,"Tokenizer treats ""Boxing-Bruno"" as one token",False,True,False,True
 210,dev,42,"[476, 539): 'Driefontein Consolidated and Gold Fields ' Kloof Gold Mining Co'",ORG,Span,"[476, 500): 'Driefontein Consolidated'",ORG,17,,True,True,True,False
-211,dev,42,"[476, 539): 'Driefontein Consolidated and Gold Fields ' Kloof Gold Mining Co'",ORG,Span,"[505, 516): 'Gold Fields'",ORG,16,Description of two companies jointly owning a third company,True,False,False,False
-212,dev,42,"[476, 539): 'Driefontein Consolidated and Gold Fields' Kloof Gold Mining Co'",ORG,Span,"[505, 516): 'Gold Fields''",ORG,,Two separate companies,False,True,False,True
-213,dev,42,"[476, 539): 'Driefontein Consolidated and Gold Fields ' Kloof Gold Mining Co'",ORG,Span,"[519, 539): 'Kloof Gold Mining Co'",ORG,16,Description of two companies jointly owning a third company,True,False,False,False
-214,dev,42,"[476, 539): 'Driefontein Consolidated and Gold Fields' Kloof Gold Mining Co'",ORG,Span,"[519, 539): 'Kloof Gold Mining Co'",,,Two companies that own a third company,False,True,False,True
+211,dev,42,,,Missing,"[505, 516): 'Gold Fields'",ORG,16,Description of two companies jointly owning a third company,True,False,False,False
+213,dev,42,,,Missing,"[519, 539): 'Kloof Gold Mining Co'",ORG,16,Description of two companies jointly owning a third company,True,False,False,False
 248,dev,59,,,Missing,"[2024, 2029): 'Comex'",MISC,,"""New York's Comex market""",False,False,True,True
 251,dev,60,"[1358, 1371): 'Tripoli-based'",MISC,Token,"[1358, 1365): 'Tripoli'",LOC,15,,False,True,False,False
 257,dev,61,"[3878, 3888): 'Whitewater'",LOC,Tag,"[3878, 3888): 'Whitewater'",MISC,16,Event: whitewater scandal,False,True,False,False
@@ -240,10 +238,8 @@
 721,dev,155,"[19, 25): 'FRANCE'",LOC,Tag,"[19, 25): 'FRANCE'",ORG,13,,False,True,False,False
 731,dev,156,"[19, 25): 'FRANCE'",LOC,Tag,"[19, 25): 'FRANCE'",ORG,17,,False,True,False,False
 733,dev,157,"[39, 45): 'TURKEY'",LOC,Tag,"[39, 45): 'TURKEY'",ORG,17,,False,True,False,False
-737,dev,158,"[120, 128): 'division'",MISC,Span,"[114, 128): 'first division'",MISC,17,,False,True,False,False
 738,dev,158,,,Missing,"[42, 56): 'FIRST DIVISION'",MISC,,,False,False,False,True
 739,dev,158,"[120, 128): 'division'",MISC,Wrong,,,16, divisions of leagues not entities,True,False,False,False
-740,dev,159,"[114, 122): 'division'",MISC,Span,"[108, 122): 'first division'",MISC,17,,False,True,False,False
 741,dev,159,,,Missing,"[37, 51): 'FIRST DIVISION'",MISC,,,False,False,False,True
 742,dev,159,"[114, 122): 'division'",MISC,Wrong,,,16, divisions of leagues not entities,True,False,False,False
 743,dev,160,"[32, 38): 'TURKEY'",LOC,Tag,"[32, 38): 'TURKEY'",ORG,16,Zach: Soccer nat team,False,True,False,False
@@ -438,7 +434,6 @@
 1388,test,54,"[3421, 3428): 'Boxmeer'",PER,Token,"[3417, 3428): 'van Boxmeer'",,17,,False,True,False,False
 1389,test,54,"[3421, 3428): 'Boxmeer'",,Span,"[3417, 3428): 'van Boxmeer'",PER,9,,True,False,False,False
 1396,test,54,,,Token,"[?, 27): 'ZYWIEC'",ORG,,INTERVIEW-ZYWIECï¿½SEES NO BIG 97 NET RISE.,False,True,False,True
-1397,test,54,"[3224, 3230): 'Zywiec'",ORG,Wrong,,,17,Company name as first word of brand name; see next line,False,True,False,False
 1398,test,54,"[3231, 3241): 'Full Light'",MISC,Wrong,,,16,"See previous line – type of beer, brand",True,False,False,False
 1407,test,55,"[129, 134): 'Czech'",LOC,Tag,"[129, 134): 'Czech'",MISC,17,,False,True,False,False
 1411,test,56,"[11, 16): 'UK-US'",MISC,Token,"[11, 13): 'UK'",LOC,15,,False,True,False,False
@@ -908,7 +903,7 @@
 2621,train,346,"[243, 248): 'Lotte'",PER,Tag,"[243, 248): 'Lotte'",ORG,,Baseball team,False,False,True,True
 2622,train,346,"[243, 248): 'Lotte'",,Tag,"[243, 248): 'Lotte'",ORG,,Baseball team,False,True,False,True
 2623,train,347,"[81, 93): 'Major League'",MISC,Sentence,"[81, 102): 'Major League Baseball'",,,"Sentence boundary between ""League"" and ""Baseball""",False,True,False,True
-2624,train,349,"[591, 611): 'Mariner Darren Bragg'","[591, 598): 'Mariner'",Both,MISC,,,,False,False,False,True
+2624,train,349,"[591, 611): 'Mariner Darren Bragg'",,Both,"[591, 598): 'Mariner'",MISC,,,False,False,False,True
 2625,train,349,"[1001, 1010): 'Cleveland'",ORG,Tag,"[1001, 1010): 'Cleveland'",LOC,,"In Cleveland, Kevin Seitzer's two-out singleï¿½",False,True,True,True
 2626,train,349,"[591, 611): 'Mariner Darren Bragg'",,Span,"[599, 611): 'Darren Bragg'",,,,False,True,False,True
 2627,train,351,"[83, 95): 'Major League'",MISC,Sentence,"[83, 104): 'Major League Baseball'",,,"Sentence boundary between ""League"" and ""Baseball""",False,True,False,True
@@ -960,7 +955,7 @@
 2686,train,458,"[1294, 1300): 'Nicola'",MISC,Wrong,,,,,False,False,True,True
 2687,train,458,"[363, 366): 'Col'",LOC,Wrong,,,,"Abbreviation for ""Colonel"". Titles not considered entities; also tagged as LOC, which is doubly wrong",False,False,False,True
 2695,train,487,,,Missing,"[911, 916): 'NYMEX'",ORG,,,False,False,True,True
-2696,train,488,,,Missing,"[224, 257): 'OCASEK GOVERNMENT OFFICE BUILDING'",LOC,,https://das.ohio.gov/Divisions/General-Services/Properties-and-Facilities/Ocasek,False,False,False,True
+2696,train,492,,,Missing,"[224, 257): 'OCASEK GOVERNMENT OFFICE BUILDING'",LOC,,https://das.ohio.gov/Divisions/General-Services/Properties-and-Facilities/Ocasek,False,False,False,True
 2697,train,488,"[822, 826): 'Sask'",LOC,Sentence,"[822, 827): 'Sask.'",LOC,,Incorrect sentence boundary after period and before comma,False,False,True,True
 2698,train,488,"[822, 826): 'Sask'",LOC,Span,"[822, 827): 'Sask.'",,,Period immediately before comma,False,True,False,True
 2699,train,488,"[889, 893): 'Alta'",,Span,"[889, 894): 'Alta.'",,,,False,True,False,True
@@ -1145,7 +1140,7 @@
 2975,train,872,"[395, 419): 'CLINTON AND CEDAR RAPIDS'",ORG,Both,"[407, 419): 'CEDAR RAPIDS'",LOC,,,False,False,False,True
 2977,train,874,"[59, 65): 'RENNES'",ORG,Tag,"[59, 65): 'RENNES'",LOC,,,False,True,True,True
 2980,train,893,"[124, 130): 'Fowler'",PER,Span,"[118, 130): 'Wyche Fowler'",,,,False,False,True,True
-2981,train,893,"[124, 130): 'Fowler'",,Span,"[21, 24): 'T&N'",ORG,,,False,True,False,True
+2981,train,893,"[124, 130): 'Fowler'",,Span,"[118, 130): 'Wyche Fowler'",ORG,,,False,True,False,True
 2984,train,918,"[11, 24): 'INTERVIEW-T&N'",,Token,"[11, 24): 'INTERVIEW-T&N'",,,INTERVIEW-T&N' treated as a single token,False,True,False,True
 2985,train,918,"[11, 24): 'INTERVIEW-T&N'",MISC,Token,"[21, 24): 'T&N'",ORG,,"Tokenizer treated ""INVERVIEW-T&N"" as a single token",False,False,True,True
 2986,train,919,"[112, 119): 'Richter'",PER,Tag,"[112, 119): 'Richter'",MISC,,,False,False,True,True
diff --git a/scripts/download_and_correct_corpus.py b/scripts/download_and_correct_corpus.py
@@ -252,7 +252,7 @@ def process_label_file(dataset_fold, dataset_file, csv_patch_file, csv_encoding=
                 print(f"Skip span error for {row['correct_span']}. Please correct it by hand.", file=sys.stderr)
                 continue
             if isinstance(row['correct_ent_type'], float) and math.isnan(row['correct_ent_type']):
-                print(f'[WARNING] correct ent type for line {index} are empty. Skipping...',
+                print(f'[WARNING] correct ent type for line {index} are empty. row: {row}. Skipping...',
                       file=sys.stderr)
                 continue
             dataset.correct_missing(corpus_span, row['correct_ent_type'], int(row['doc_offset']))
@@ -279,7 +279,7 @@ def process_label_file(dataset_fold, dataset_file, csv_patch_file, csv_encoding=
             dataset.correct_span(row['corpus_span'], row['correct_span'], int(row['doc_offset']))
         elif row['error_type'] == 'Both':
             if isinstance(row['correct_ent_type'], float) and math.isnan(row['correct_ent_type']):
-                print(f'[WARNING] Correct_ent_type for line {index} is empty. Skipping...', file=sys.stderr)
+                print(f'[WARNING] Correct_ent_type for line {index} is empty. row: {row}. Skipping...', file=sys.stderr)
                 continue
             dataset.correct_tag(row['corpus_span'], row['correct_ent_type'], int(row['doc_offset']))
             dataset.correct_span(row['corpus_span'], row['correct_span'], int(row['doc_offset']))
@@ -306,34 +306,36 @@ def process_sentence_file(dataset_fold, dataset_file, json_file, target_file):
         for l in file_lines:
             new_file.write(l)
 
+
 def process_token_file(dataset_fold, dataset_file, sentence_json_file, token_edits_json_file, target_file):
     with open(token_edits_json_file) as f:
-        edits = pd.read_json(f); 
-    edits = edits[edits.fold == dataset_fold]; # select only correct fold 
-    with open(sentence_json_file) as f: 
-        sentence_deletes = json.load(f);
+        edits = pd.read_json(f)
+    edits = edits[edits.fold == dataset_fold]  # select only correct fold
+    with open(sentence_json_file) as f:
+        sentence_deletes = json.load(f)
     with open(dataset_file, "r") as source_file:
         file_lines = source_file.readlines()
 
     removed = 0
     for l in range(0, edits.index.max()):
         if l in sentence_deletes[dataset_fold]:
-            removed +=1
-        if l in edits.index : 
-            file_lines[l-removed] = edits.at[l,'correct_line']
+            removed += 1
+        if l in edits.index:
+            file_lines[l-removed] = edits.at[l, 'correct_line']
     with open(target_file, "w+") as new_file:
         for l in file_lines:
             new_file.write(l)
 
 
-def apply_corrections(data_set_info, label_csv_file, sentence_json_file,token_edits_json_file, target_dir=None, corpus_fold=None):
+def apply_corrections(data_set_info, label_csv_file, sentence_json_file, token_edits_json_file, target_dir=None, corpus_fold=None):
     """
     Applies label and sentence boundary corrections
     :param data_set_info: Dictionary containing a mapping from fold name to file name for
      each of the three folds (`train`, `test`, `dev`) of the corpus.
     :param label_csv_file: CSV file containing the label corrections
     :param sentence_json_file: JSON file containing the sentence boundary corrections -- specifically the line numbers
      in each file to be deleted
+    :param token_edits_json_file: JSON file containing token edit corrections.
     :param target_dir: (optional) Target directory for the corrected corpus or
      None for default of "corrected_corpus".
     :param corpus_fold: (optional) Apply corrections to a specific fold only, or None for
@@ -354,7 +356,7 @@ def apply_corrections(data_set_info, label_csv_file, sentence_json_file,token_ed
             logging.info("Correcting sentence boundaries for fold '{}'".format(fold))
             process_sentence_file(fold, temp_file.name, sentence_json_file, temp_file.name)
 
-            logging.info("Correcting token errors for fold'{}'".format(fold))
+            logging.info("Correcting token errors for fold '{}'".format(fold))
             process_token_file(fold,temp_file.name,sentence_json_file, token_edits_json_file,target_file)
 
         logging.info("Corrected corpus fold '{}' to file: '{}'".format(fold, target_file))