Skip to content

Commit f3684a0

Browse files
authored
Patching all_conll_corrections_combined to remove errors and warnings (#26)
* Fixed all combined corrections to get rid of I-O warnings
* Reapply corrections for OCASEK and T&N errors from #3
* Fixed order of fields in 'Mariner Darren Bragg' correction
* Fixed some formatting and output in script
1 parent 139ece6 commit f3684a0

2 files changed

Lines changed: 18 additions & 21 deletions

File tree

corrected_labels/all_conll_corrections_combined.csv

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -69,10 +69,8 @@
6969
204,dev,39,"[11, 23): 'Boxing-Bruno'",MISC,Token,"[18, 23): 'Bruno'",PER,17,,False,True,True,False
7070
205,dev,39,,MISC,Token,"[18, 23): 'Bruno' ",PER,,"Tokenizer treats ""Boxing-Bruno"" as one token",False,True,False,True
7171
210,dev,42,"[476, 539): 'Driefontein Consolidated and Gold Fields ' Kloof Gold Mining Co'",ORG,Span,"[476, 500): 'Driefontein Consolidated'",ORG,17,,True,True,True,False
72-
211,dev,42,"[476, 539): 'Driefontein Consolidated and Gold Fields ' Kloof Gold Mining Co'",ORG,Span,"[505, 516): 'Gold Fields'",ORG,16,Description of two companies jointly owning a third company,True,False,False,False
73-
212,dev,42,"[476, 539): 'Driefontein Consolidated and Gold Fields' Kloof Gold Mining Co'",ORG,Span,"[505, 516): 'Gold Fields''",ORG,,Two separate companies,False,True,False,True
74-
213,dev,42,"[476, 539): 'Driefontein Consolidated and Gold Fields ' Kloof Gold Mining Co'",ORG,Span,"[519, 539): 'Kloof Gold Mining Co'",ORG,16,Description of two companies jointly owning a third company,True,False,False,False
75-
214,dev,42,"[476, 539): 'Driefontein Consolidated and Gold Fields' Kloof Gold Mining Co'",ORG,Span,"[519, 539): 'Kloof Gold Mining Co'",,,Two companies that own a third company,False,True,False,True
72+
211,dev,42,,,Missing,"[505, 516): 'Gold Fields'",ORG,16,Description of two companies jointly owning a third company,True,False,False,False
73+
213,dev,42,,,Missing,"[519, 539): 'Kloof Gold Mining Co'",ORG,16,Description of two companies jointly owning a third company,True,False,False,False
7674
248,dev,59,,,Missing,"[2024, 2029): 'Comex'",MISC,,"""New York's Comex market""",False,False,True,True
7775
251,dev,60,"[1358, 1371): 'Tripoli-based'",MISC,Token,"[1358, 1365): 'Tripoli'",LOC,15,,False,True,False,False
7876
257,dev,61,"[3878, 3888): 'Whitewater'",LOC,Tag,"[3878, 3888): 'Whitewater'",MISC,16,Event: whitewater scandal,False,True,False,False
@@ -240,10 +238,8 @@
240238
721,dev,155,"[19, 25): 'FRANCE'",LOC,Tag,"[19, 25): 'FRANCE'",ORG,13,,False,True,False,False
241239
731,dev,156,"[19, 25): 'FRANCE'",LOC,Tag,"[19, 25): 'FRANCE'",ORG,17,,False,True,False,False
242240
733,dev,157,"[39, 45): 'TURKEY'",LOC,Tag,"[39, 45): 'TURKEY'",ORG,17,,False,True,False,False
243-
737,dev,158,"[120, 128): 'division'",MISC,Span,"[114, 128): 'first division'",MISC,17,,False,True,False,False
244241
738,dev,158,,,Missing,"[42, 56): 'FIRST DIVISION'",MISC,,,False,False,False,True
245242
739,dev,158,"[120, 128): 'division'",MISC,Wrong,,,16, divisions of leagues not entities,True,False,False,False
246-
740,dev,159,"[114, 122): 'division'",MISC,Span,"[108, 122): 'first division'",MISC,17,,False,True,False,False
247243
741,dev,159,,,Missing,"[37, 51): 'FIRST DIVISION'",MISC,,,False,False,False,True
248244
742,dev,159,"[114, 122): 'division'",MISC,Wrong,,,16, divisions of leagues not entities,True,False,False,False
249245
743,dev,160,"[32, 38): 'TURKEY'",LOC,Tag,"[32, 38): 'TURKEY'",ORG,16,Zach: Soccer nat team,False,True,False,False
@@ -438,7 +434,6 @@
438434
1388,test,54,"[3421, 3428): 'Boxmeer'",PER,Token,"[3417, 3428): 'van Boxmeer'",,17,,False,True,False,False
439435
1389,test,54,"[3421, 3428): 'Boxmeer'",,Span,"[3417, 3428): 'van Boxmeer'",PER,9,,True,False,False,False
440436
1396,test,54,,,Token,"[?, 27): 'ZYWIEC'",ORG,,INTERVIEW-ZYWIEC SEES NO BIG 97 NET RISE.,False,True,False,True
441-
1397,test,54,"[3224, 3230): 'Zywiec'",ORG,Wrong,,,17,Company name as first word of brand name; see next line,False,True,False,False
442437
1398,test,54,"[3231, 3241): 'Full Light'",MISC,Wrong,,,16,"See previous line – type of beer, brand",True,False,False,False
443438
1407,test,55,"[129, 134): 'Czech'",LOC,Tag,"[129, 134): 'Czech'",MISC,17,,False,True,False,False
444439
1411,test,56,"[11, 16): 'UK-US'",MISC,Token,"[11, 13): 'UK'",LOC,15,,False,True,False,False
@@ -908,7 +903,7 @@
908903
2621,train,346,"[243, 248): 'Lotte'",PER,Tag,"[243, 248): 'Lotte'",ORG,,Baseball team,False,False,True,True
909904
2622,train,346,"[243, 248): 'Lotte'",,Tag,"[243, 248): 'Lotte'",ORG,,Baseball team,False,True,False,True
910905
2623,train,347,"[81, 93): 'Major League'",MISC,Sentence,"[81, 102): 'Major League Baseball'",,,"Sentence boundary between ""League"" and ""Baseball""",False,True,False,True
911-
2624,train,349,"[591, 611): 'Mariner Darren Bragg'","[591, 598): 'Mariner'",Both,MISC,,,,False,False,False,True
906+
2624,train,349,"[591, 611): 'Mariner Darren Bragg'",,Both,"[591, 598): 'Mariner'",MISC,,,False,False,False,True
912907
2625,train,349,"[1001, 1010): 'Cleveland'",ORG,Tag,"[1001, 1010): 'Cleveland'",LOC,,"In Cleveland, Kevin Seitzer's two-out single...",False,True,True,True
913908
2626,train,349,"[591, 611): 'Mariner Darren Bragg'",,Span,"[599, 611): 'Darren Bragg'",,,,False,True,False,True
914909
2627,train,351,"[83, 95): 'Major League'",MISC,Sentence,"[83, 104): 'Major League Baseball'",,,"Sentence boundary between ""League"" and ""Baseball""",False,True,False,True
@@ -960,7 +955,7 @@
960955
2686,train,458,"[1294, 1300): 'Nicola'",MISC,Wrong,,,,,False,False,True,True
961956
2687,train,458,"[363, 366): 'Col'",LOC,Wrong,,,,"Abbreviation for ""Colonel"". Titles not considered entities; also tagged as LOC, which is doubly wrong",False,False,False,True
962957
2695,train,487,,,Missing,"[911, 916): 'NYMEX'",ORG,,,False,False,True,True
963-
2696,train,488,,,Missing,"[224, 257): 'OCASEK GOVERNMENT OFFICE BUILDING'",LOC,,https://das.ohio.gov/Divisions/General-Services/Properties-and-Facilities/Ocasek,False,False,False,True
958+
2696,train,492,,,Missing,"[224, 257): 'OCASEK GOVERNMENT OFFICE BUILDING'",LOC,,https://das.ohio.gov/Divisions/General-Services/Properties-and-Facilities/Ocasek,False,False,False,True
964959
2697,train,488,"[822, 826): 'Sask'",LOC,Sentence,"[822, 827): 'Sask.'",LOC,,Incorrect sentence boundary after period and before comma,False,False,True,True
965960
2698,train,488,"[822, 826): 'Sask'",LOC,Span,"[822, 827): 'Sask.'",,,Period immediately before comma,False,True,False,True
966961
2699,train,488,"[889, 893): 'Alta'",,Span,"[889, 894): 'Alta.'",,,,False,True,False,True
@@ -1145,7 +1140,7 @@
11451140
2975,train,872,"[395, 419): 'CLINTON AND CEDAR RAPIDS'",ORG,Both,"[407, 419): 'CEDAR RAPIDS'",LOC,,,False,False,False,True
11461141
2977,train,874,"[59, 65): 'RENNES'",ORG,Tag,"[59, 65): 'RENNES'",LOC,,,False,True,True,True
11471142
2980,train,893,"[124, 130): 'Fowler'",PER,Span,"[118, 130): 'Wyche Fowler'",,,,False,False,True,True
1148-
2981,train,893,"[124, 130): 'Fowler'",,Span,"[21, 24): 'T&N'",ORG,,,False,True,False,True
1143+
2981,train,893,"[124, 130): 'Fowler'",,Span,"[118, 130): 'Wyche Fowler'",ORG,,,False,True,False,True
11491144
2984,train,918,"[11, 24): 'INTERVIEW-T&N'",,Token,"[11, 24): 'INTERVIEW-T&N'",,,INTERVIEW-T&N' treated as a single token,False,True,False,True
11501145
2985,train,918,"[11, 24): 'INTERVIEW-T&N'",MISC,Token,"[21, 24): 'T&N'",ORG,,"Tokenizer treated ""INVERVIEW-T&N"" as a single token",False,False,True,True
11511146
2986,train,919,"[112, 119): 'Richter'",PER,Tag,"[112, 119): 'Richter'",MISC,,,False,False,True,True

scripts/download_and_correct_corpus.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ def process_label_file(dataset_fold, dataset_file, csv_patch_file, csv_encoding=
252252
print(f"Skip span error for {row['correct_span']}. Please correct it by hand.", file=sys.stderr)
253253
continue
254254
if isinstance(row['correct_ent_type'], float) and math.isnan(row['correct_ent_type']):
255-
print(f'[WARNING] correct ent type for line {index} are empty. Skipping...',
255+
print(f'[WARNING] correct ent type for line {index} are empty. row: {row}. Skipping...',
256256
file=sys.stderr)
257257
continue
258258
dataset.correct_missing(corpus_span, row['correct_ent_type'], int(row['doc_offset']))
@@ -279,7 +279,7 @@ def process_label_file(dataset_fold, dataset_file, csv_patch_file, csv_encoding=
279279
dataset.correct_span(row['corpus_span'], row['correct_span'], int(row['doc_offset']))
280280
elif row['error_type'] == 'Both':
281281
if isinstance(row['correct_ent_type'], float) and math.isnan(row['correct_ent_type']):
282-
print(f'[WARNING] Correct_ent_type for line {index} is empty. Skipping...', file=sys.stderr)
282+
print(f'[WARNING] Correct_ent_type for line {index} is empty. row: {row}. Skipping...', file=sys.stderr)
283283
continue
284284
dataset.correct_tag(row['corpus_span'], row['correct_ent_type'], int(row['doc_offset']))
285285
dataset.correct_span(row['corpus_span'], row['correct_span'], int(row['doc_offset']))
@@ -306,34 +306,36 @@ def process_sentence_file(dataset_fold, dataset_file, json_file, target_file):
306306
for l in file_lines:
307307
new_file.write(l)
308308

309+
309310
def process_token_file(dataset_fold, dataset_file, sentence_json_file, token_edits_json_file, target_file):
310311
with open(token_edits_json_file) as f:
311-
edits = pd.read_json(f);
312-
edits = edits[edits.fold == dataset_fold]; # select only correct fold
313-
with open(sentence_json_file) as f:
314-
sentence_deletes = json.load(f);
312+
edits = pd.read_json(f)
313+
edits = edits[edits.fold == dataset_fold] # select only correct fold
314+
with open(sentence_json_file) as f:
315+
sentence_deletes = json.load(f)
315316
with open(dataset_file, "r") as source_file:
316317
file_lines = source_file.readlines()
317318

318319
removed = 0
319320
for l in range(0, edits.index.max()):
320321
if l in sentence_deletes[dataset_fold]:
321-
removed +=1
322-
if l in edits.index :
323-
file_lines[l-removed] = edits.at[l,'correct_line']
322+
removed += 1
323+
if l in edits.index:
324+
file_lines[l-removed] = edits.at[l, 'correct_line']
324325
with open(target_file, "w+") as new_file:
325326
for l in file_lines:
326327
new_file.write(l)
327328

328329

329-
def apply_corrections(data_set_info, label_csv_file, sentence_json_file,token_edits_json_file, target_dir=None, corpus_fold=None):
330+
def apply_corrections(data_set_info, label_csv_file, sentence_json_file, token_edits_json_file, target_dir=None, corpus_fold=None):
330331
"""
331332
Applies label and sentence boundary corrections
332333
:param data_set_info: Dictionary containing a mapping from fold name to file name for
333334
each of the three folds (`train`, `test`, `dev`) of the corpus.
334335
:param label_csv_file: CSV file containing the label corrections
335336
:param sentence_json_file: JSON file containing the sentence boundary corrections -- specifically the line numbers
336337
in each file to be deleted
338+
:param token_edits_json_file: JSON file containing token edit corrections.
337339
:param target_dir: (optional) Target directory for the corrected corpus or
338340
None for default of "corrected_corpus".
339341
:param corpus_fold: (optional) Apply corrections to a specific fold only, or None for
@@ -354,7 +356,7 @@ def apply_corrections(data_set_info, label_csv_file, sentence_json_file,token_ed
354356
logging.info("Correcting sentence boundaries for fold '{}'".format(fold))
355357
process_sentence_file(fold, temp_file.name, sentence_json_file, temp_file.name)
356358

357-
logging.info("Correcting token errors for fold'{}'".format(fold))
359+
logging.info("Correcting token errors for fold '{}'".format(fold))
358360
process_token_file(fold,temp_file.name,sentence_json_file, token_edits_json_file,target_file)
359361

360362
logging.info("Corrected corpus fold '{}' to file: '{}'".format(fold, target_file))

0 commit comments

Comments
 (0)