Skip to content

Commit dd8f886

Browse files
authored
Merge pull request #36 from CODAIT/fix-labels-2
Adjust audited files so that we can generate all_conll_corrections_combined.csv automatically
2 parents d207c2a + 2785a8f commit dd8f886

23 files changed

Lines changed: 7294 additions & 3749 deletions

corrected_corpus/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
eng.*
2+

corrected_labels/all_conll_corrections_combined.csv

Lines changed: 625 additions & 389 deletions
Large diffs are not rendered by default.

corrected_labels/human_labels_audited/CoNLL_2_in_gold.csv

Lines changed: 33 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
11
num_models,fold,doc_offset,corpus_span,corpus_ent_type,error_type,correct_span,correct_ent_type,notes,time_started,time_stopped,time_elapsed
22
0,dev,2,"[25, 30): 'ASHES'",MISC,None,,,teams label as ORG ,,,
33
0,dev,15,"[15, 40): 'AMERICAN FOOTBALL-RANDALL'",MISC,Wrong,,, divisions of leagues not entities,,,
4-
,dev,15,"[41, 51): 'CUNNINGHAM'",PER,Token,"(33, 51]: 'RANDALL CUNNINGHAM'",,"need to split on '-' ""FOOTBALL-RANDALL""",,,
4+
,dev,15,"[41, 51): 'CUNNINGHAM'",PER,Token,"[33, 51): 'RANDALL CUNNINGHAM'",PER,"need to split on '-' ""FOOTBALL-RANDALL""",,,
55
0,dev,20,"[90, 96): 'Berlin'",MISC,Sentence,"[90, 107): 'Berlin Grand Prix'",,,,,
66
0,dev,22,"[213, 244): 'Solidarity Meeting for Sarajevo'",MISC,None,,,,,,
77
0,dev,22,"[826, 847): 'IAAF Grand Prix Final'",MISC,None,,,,,,
88
0,dev,34,"[2269, 2291): 'Jackson Hole symposium'",MISC,None,,,teams label as PER,,,
99
0,dev,38,"[624, 635): 'Chicago PMI'",MISC,None,,,Purchase Managers Index,,,
10-
0,dev,39,"[11, 23): 'Boxing-Bruno'",MISC,Span,"[18, 23): 'Bruno'",PER,TODO cleanup,,,
11-
0,dev,42,"[476, 539): 'Driefontein Consolidated and Gold Fields ' Kloof Gold Mining Co'",ORG,Span,"[476, 500): 'Driefontein Consolidated'",ORG,Description of two companies jointly owning a third company,,,
12-
0,dev,42,"[476, 539): 'Driefontein Consolidated and Gold Fields ' Kloof Gold Mining Co'",ORG,Span,"[505, 516): 'Gold Fields'",ORG,Description of two companies jointly owning a third company,,,
10+
0,dev,39,"[11, 23): 'Boxing-Bruno'",MISC,Token,"[18, 23): 'Bruno'",PER,Tokenizer treated 'Boxing-Bruno' as one token,,,
11+
,dev,42,,,Missing,"[476, 500): 'Driefontein Consolidated'",ORG,Description of two companies jointly owning a third company,,,
12+
,dev,42,,,Missing,"[505, 516): 'Gold Fields'",ORG,Description of two companies jointly owning a third company,,,
1313
0,dev,42,"[476, 539): 'Driefontein Consolidated and Gold Fields ' Kloof Gold Mining Co'",ORG,Span,"[519, 539): 'Kloof Gold Mining Co'",ORG,Description of two companies jointly owning a third company,,,
1414
0,dev,47,"[106, 122): 'Aerodrom Beograd'",LOC,None,,,"Is an airport, teams label as PER",,,
1515
0,dev,58,"[354, 356): 'T3'",LOC,None,,,is a prison in Phnom Penh,,,
1616
0,dev,60,"[1480, 1495): 'Jebel al-Akhdar'",LOC,None,,,"is a region, teams missed al-Akhdar",,,
17-
0,dev,65,"[1125, 1134): 'asset-St.'",MISC,Token,,,Asset-St. Treated as 1 token,,,
18-
0,dev,65,"[1135, 1140): 'Louis'",MISC,Sentence,"(1130, 1140]: 'St. Louis'",LOC,Sentence boundary btw St. and Louis,,,
17+
0,dev,65,"[1125, 1134): 'asset-St.'",MISC,Token,"[1131, 1140): 'St. Louis'",LOC,Sentence boundary and tokenization issue at the same place,,,
18+
0,dev,65,"[1135, 1140): 'Louis'",MISC,Sentence,"[1131, 1140): 'St. Louis'",LOC,Sentence boundary and tokenization issue at the same place,,,
1919
0,dev,66,"[1163, 1180): 'Pan Am flight 103'",MISC,None,,,"ambiguous, but was an event",,,
2020
0,dev,82,"[143, 145): 'MI'",LOC,None,,,,,,
2121
0,dev,85,"[1140, 1148): 'Indosuez'",ORG,None,,,teams label as LOC,,,
@@ -95,9 +95,9 @@ num_models,fold,doc_offset,corpus_span,corpus_ent_type,error_type,correct_span,c
9595
0,test,27,"[565, 573): 'X-DENVER'",MISC,Token,"[567, 573): 'DENVER'",ORG,"split on '-', ""X"" is an annotation",,,
9696
0,test,27,"[889, 900): 'Y-GREEN BAY'",MISC,Token,"[891, 900): 'GREEN BAY'",ORG,"split on '-', ""Y"" is an annotation",,,
9797
,test,27,"[410, 412): 'PA'",ORG,Wrong,,,Points Allowed,,,
98-
0,test,28,"[82, 90): 'National'",ORG,Sentence,"[82, 106): 'National Football League'",,Also sentence boundary after National,,,
99-
0,test,28,"[91, 99): 'Football'",LOC,Sentence,"[82, 106): 'National Football League'",ORG,,,,
100-
0,test,28,"[100, 106): 'League'",LOC,Sentence,"[82, 106): 'National Football League'",ORG,,,,
98+
0,test,28,"[82, 90): 'National'",ORG,Sentence,"[82, 106): 'National Football League'",ORG,Also entity boundary after National,,,
99+
0,test,28,"[91, 99): 'Football'",LOC,Tag,"[91, 99): 'Football'",ORG,,,,
100+
0,test,28,"[100, 106): 'League'",LOC,Tag,"[100, 106): 'League'",ORG,,,,
101101
0,test,29,"[25, 44): 'FOOTBALL-OHIO STATE'",MISC,Token,"[34, 44): 'OHIO STATE'",ORG,"Need to split on '-' ""FOOTBALL-OHIO""",,,
102102
0,test,29,"[47, 51): 'PACE'",PER,None,,,last name,,,
103103
0,test,29,"[65, 79): 'LOMBARDI AWARD'",MISC,None,,,Award,,,
@@ -123,9 +123,8 @@ num_models,fold,doc_offset,corpus_span,corpus_ent_type,error_type,correct_span,c
123123
0,test,49,"[57, 63): 'DURBAN'",PER,Tag,,LOC,city,,,
124124
,test,54,"[11, 27): 'INTERVIEW-ZYWIEC'",MISC,Token,"[21, 27): 'ZYWIEC'",ORG,Need to split on '-',,,
125125
0,test,54,"[1717, 1723): 'Okocim'",ORG,None,,,brewery,,,
126-
,test,54,"[3224, 3230): 'Zywiec'",ORG,Span,"[3224, 3241): 'Zywiec Full Light'",,,,,
127-
0,test,54,"[3231, 3241): 'Full Light'",MISC,Wrong,,,"See previous line – type of beer, brand",,,
128-
,test,63,"[19, 39): 'office-Conservatives' ",MISC,Token,"[26, 39): 'Conservatives' ",ORG,political party,,,
126+
,test,54,"[3224, 3230): 'Zywiec'",ORG,None,,,"Ambiguous: Is [3224, 3241): 'Zywiec Full Light' the brand, or is ""Full Light"" the brand?",,,
127+
0,test,54,"[3231, 3241): 'Full Light'",MISC,Tag,,ORG,"See previous line – type of beer, brand",,,
129128
0,test,63,"[148, 160): 'Conservative'",MISC,Tag,,ORG,political party,,,
130129
0,test,70,"[79, 93): 'Maritime Queen'",MISC,None,,,carrier,,,
131130
0,test,70,"[177, 197): 'New York Commodities'",ORG,Span,"[177, 202): 'New York Commodities Desk'",,"??? team labeled ""New York Commodities Desk""",,,
@@ -153,7 +152,7 @@ num_models,fold,doc_offset,corpus_span,corpus_ent_type,error_type,correct_span,c
153152
0,test,106,"[311, 338): 'Asia Rubber Markets meeting'",MISC,None,,,,,,
154153
0,test,108,"[59, 75): 'Haitham Haddadin'",ORG,Tag,,PER,looks to be the author,,,
155154
0,test,108,"[2392, 2415): 'Guardians of the Cedars'",ORG,None,,,a militia,,,
156-
0,test,112,"[174, 184): 'John Mills'",PER,Span,"[174, 187): 'John Mills Jr'",,,,,
155+
0,test,112,"[174, 184): 'John Mills'",PER,Sentence,"[174, 188): 'John Mills Jr.'",PER,"Wrong span, and incorrect sentence boundary after ""Jr.""",,,
157156
0,test,115,"[326, 341): 'Outagmie County'",LOC,None,,,,,,
158157
0,test,117,"[83, 88): 'NYMEX'",ORG,None,,,exchange,,,
159158
0,test,117,"[89, 98): 'Henry Hub'",LOC,None,,,gas pipeline,,,
@@ -202,20 +201,20 @@ num_models,fold,doc_offset,corpus_span,corpus_ent_type,error_type,correct_span,c
202201
0,test,183,"[18, 35): 'SKIING-GLADISHIVA'",MISC,Token,"[25, 35): 'GLADISHIVA'",PER,split on '-',,,
203202
0,test,185,"[233, 244): 'Lillehammer'",LOC,None,,,,,,
204203
0,test,186,"[432, 446): 'Ingeborg Helen'",PER,Span,"[432, 454): 'Ingeborg Helen Markein'",,,,,
205-
,test,186,"[533, 541): 'Florence' ",LOC,Wrong,,,,,,
206-
0,test,186,"[542, 549): 'Masnada'",PER,Span,"[533, 549): 'Florence Masnada' ",,,,,
204+
,test,186,"[533, 541): 'Florence'",LOC,Wrong,,,,,,
205+
0,test,186,"[542, 549): 'Masnada'",PER,Span,"[533, 549): 'Florence Masnada'",,,,,
207206
,test,186,"[1275, 1283): 'Florence'",LOC,Wrong,,,,,,
208-
0,test,186,"[1284, 1291): 'Masnada'",PER,Span,"[1275, 1291): 'Florence Masnada' ",,,,,
207+
0,test,186,"[1284, 1291): 'Masnada'",PER,Span,"[1275, 1291): 'Florence Masnada'",,,,,
209208
0,test,186,"[2292, 2304): 'Nation's Cup'",MISC,None,,,sporting event,,,
210-
0,test,190,"[11, 27): 'BOBSLEIGH-SHIMER'",MISC,Token,,,,,,
211-
,test,190,"[440, 445): 'Italy' ",LOC,Span,"[440, 447): 'Italy I' ",ORG,bobsled division,,,
209+
0,test,190,"[11, 27): 'BOBSLEIGH-SHIMER'",MISC,Token,"[21, 27): 'SHIMER'",PER,"Tokenizer treated ""BOBSLEIGH-SHIMER"" as a single token",,,
210+
,test,190,"[440, 445): 'Italy'",LOC,Span,"[440, 447): 'Italy I'",ORG,bobsled division,,,
212211
0,test,192,"[225, 232): 'Italy I'",ORG,None,,,bobsled division,,,
213212
0,test,192,"[364, 372): 'German I'",ORG,None,,,bobsled division,,,
214213
0,test,192,"[431, 444): 'Switzerland I'",ORG,None,,,bobsled division,,,
215214
0,test,192,"[566, 582): 'Czech Republic I'",ORG,None,,,bobsled division,,,
216215
0,test,192,"[776, 791): 'United States I'",ORG,None,,,bobsled division,,,
217-
0,test,199,"[27, 52): 'SCOTTISH PREMIER DIVISION'",MISC,Span,"[27, 35): 'SCOTTISH'",,,,,
218-
0,test,199,"[108, 124): 'Scottish premier'",MISC,Span,"[108, 116): 'Scottish'",,,,,
216+
0,test,199,"[27, 52): 'SCOTTISH PREMIER DIVISION'",MISC,Span,"[27, 35): 'SCOTTISH'",MISC,Divisions of leagues not considered entities,,,
217+
0,test,199,"[108, 124): 'Scottish premier'",MISC,Span,"[108, 116): 'Scottish'",MISC,Divisions of leagues not considered entities,,,
219218
0,test,205,"[627, 636): 'Wimbledon'",LOC,Tag,,ORG,soccer team,,,
220219
0,test,207,"[196, 210): 'Premier league'",MISC,None,,,,,,
221220
0,test,207,"[1041, 1047): 'Oxford'",LOC,Tag,,ORG,,,,
@@ -300,8 +299,8 @@ num_models,fold,doc_offset,corpus_span,corpus_ent_type,error_type,correct_span,c
300299
1,test,9,"[774, 785): 'Efes Pilsen'",ORG,None,,,team ,,,
301300
1,test,9,"[804, 814): 'Pau-Orthez'",ORG,None,,,team ,,,
302301
1,test,11,"[133, 150): 'Chapman Golf Club'",LOC,None,,,,,,
303-
,test,24,"[211, 226): 'National Hockey'",ORG,Sentence,,,,,,
304-
1,test,24,"[227, 233): 'League'",ORG,Wrong,"[211, 233): 'National Hockey League'",,,,,
302+
,test,24,"[211, 226): 'National Hockey'",ORG,Sentence,"[211, 233): 'National Hockey League'",ORG,,,,
303+
1,test,24,"[227, 233): 'League'",ORG,Sentence,"[211, 233): 'National Hockey League'",ORG,,,,
305304
1,test,25,"[47, 53): 'EAGLES'",ORG,None,,,,,,
306305
1,test,29,"[788, 802): 'Lombardi Award'",MISC,None,,,,,,
307306
1,test,38,"[1089, 1095): 'Oviedo'",ORG,None,,,"Ambiguous, seems to refer to team in city 'Oviedo Spain', Span in LOC",,,
@@ -323,19 +322,19 @@ num_models,fold,doc_offset,corpus_span,corpus_ent_type,error_type,correct_span,c
323322
1,test,108,"[1510, 1527): 'Christian-Shi'ite'",MISC,Span,"[1510, 1534): 'Christian-Shi'ite Moslem'",,all describes one force of people,,,
324323
,test,108,"[1528, 1534): 'Moslem'",Misc,Wrong,,,,,,
325324
1,test,111,"[739, 741): 'NJ'",LOC,None,,,,,,
326-
,test,114,"[11, 17): 'Iowa-S' ",LOC,Token,"[11, 15): 'Iowa' ",,split on '-',,,
327-
,test,114,"[18, 22): 'Minn'",LOC,Span,"[16, 22): 'S Minn'",,,,,
325+
,test,114,"[11, 17): 'Iowa-S'",LOC,Token,"[11, 15): 'Iowa'",LOC,split on '-',,,
326+
,test,114,"[18, 22): 'Minn'",LOC,Span,"[16, 22): 'S Minn'",LOC,,,,
328327
1,test,114,"[51, 61): 'sales-USDA'",MISC,Token,"[57, 61): 'USDA'",,split on '-',,,
329328
1,test,117,"[1338, 1343): 'NYMEX'",ORG,None,,,exchange,,,
330329
1,test,118,"[127, 136): 'St. Louis'",LOC,None,,,,,,
331330
1,test,118,"[535, 550): 'mid-Mississippi'",MISC,Tag,,LOC,location where barge is,,,
332-
,test,118,"[552, 560): 'McGregor' ",PER,Tag,,LOC,is a city on the Mississippi River,,,
333-
,test,118,"[776, 791): 'mid-Mississippi' ",MISC,Tag,,LOC,location where barge is,,,
331+
,test,118,"[552, 560): 'McGregor'",PER,Tag,,LOC,is a city on the Mississippi River,,,
332+
,test,118,"[776, 791): 'mid-Mississippi'",MISC,Tag,,LOC,location where barge is,,,
334333
1,test,119,"[368, 375): 'Chicago'",LOC,None,,,,,,
335334
1,test,122,"[522, 545): 'Glencoe Animal Hospital'",ORG,Tag,,LOC,specific hospital,,,
336335
1,test,122,"[1583, 1594): 'Rottweilers'",MISC,None,,,,,,
337-
1,test,123,"[11, 17): 'Iowa-S'",LOC,Token,"[11, 15): 'Iowa' ",,split on '-',,,
338-
,test,123,"[18, 22): 'Minn'",LOC,Span,"[16, 22): 'S Minn'",,,,,
336+
1,test,123,"[11, 17): 'Iowa-S'",LOC,Token,"[11, 15): 'Iowa'",LOC,split on '-',,,
337+
,test,123,"[18, 22): 'Minn'",LOC,Token,"[16, 22): 'S Minn'",LOC,,,,
339338
1,test,134,"[62, 73): 'Lantau Peak'",MISC,Tag,,LOC,??? def a peak,,,
340339
1,test,137,"[269, 282): 'Saharan Blend'",MISC,None,,,type of crude oil,,,
341340
1,test,137,"[341, 354): 'Arabian Light'",MISC,None,,,type of crude oil,,,
@@ -346,13 +345,13 @@ num_models,fold,doc_offset,corpus_span,corpus_ent_type,error_type,correct_span,c
346345
1,test,152,"[556, 559): 'Let'",LOC,Wrong,,,Let's march together,,,
347346
1,test,152,"[1035, 1041): 'League'",LOC,Tag,,ORG,appears other times in doc,,,
348347
1,test,153,"[229, 238): 'Wednesday'",ORG,Wrong,,,day of the week,,,
349-
1,test,161,"[11, 24): 'John Lewis UK'",ORG,Span,"[11, 21): 'John Lewis'",,"??? division of company based in UK, should UK be separate?",,,
348+
1,test,161,"[11, 24): 'John Lewis UK'",ORG,Span,"[11, 21): 'John Lewis'",,John Lewis Partnership is a company that owns some stores in the UK,,,
350349
1,test,163,"[121, 133): 'Conservative'",MISC,Tag,,ORG,political party,,,
351350
,test,163,"[411, 424): 'Conservatives' ",,Missing,,ORG,political party,,,
352351
1,test,163,"[1365, 1377): 'Conservative'",MISC,,,,political party,,,
353352
1,test,176,"[43, 49): 'Busang'",LOC,None,,,,,,
354353
1,test,176,"[781, 787): 'Busang'",LOC,None,,,,,,
355-
,test,176,"[2419, 2425): 'Busang' ",ORG,Tag,,LOC,??? city in Indonesia that has big gold deposits,,,
354+
,test,176,"[2419, 2425): 'Busang'",ORG,Tag,,LOC,??? city in Indonesia that has big gold deposits,,,
356355
,test,176,"[2732, 2738): 'Busang'",ORG,Tag,,LOC,,,,
357356
1,test,176,"[2960, 2966): 'Busang'",ORG,Tag,,LOC,,,,
358357
1,test,178,"[951, 960): 'then-U.S.'",MISC,None,,,used to be a U.S. trade rep,,,
@@ -447,11 +446,11 @@ num_models,fold,doc_offset,corpus_span,corpus_ent_type,error_type,correct_span,c
447446
2,test,54,"[99, 105): 'Zywiec'",ORG,None,,,brewery,,,
448447
2,test,54,"[626, 632): 'Zywiec'",ORG,None,,,brewery,,,
449448
2,test,54,"[2966, 2972): 'Okocim'",ORG,None,,,another brewery,,,
450-
2,test,54,"[3224, 3230): 'Zywiec'",ORG,Span,"[3224, 3241): 'Zywiec Full Light'",,beer brand,,,
449+
2,test,54,"[3224, 3230): 'Zywiec'",ORG,None,,,,,,
451450
2,test,54,"[3522, 3528): 'Zywiec'",ORG,None,,,brewery,,,
452451
2,test,60,"[2313, 2329): 'London-to-Boston'",MISC,None,,,airplane route,,,
453452
2,test,63,"[11, 16): 'Major'",PER,None,,,last name,,,
454-
2,test,63,"[19, 39): 'office-Conservatives'",MISC,Token,"[26, 39): 'Conservatives' ",ORG,political party,,,
453+
2,test,63,"[19, 39): 'office-Conservatives'",MISC,Token,"[26, 39): 'Conservatives'",ORG,political party,,,
455454
2,test,67,"[682, 690): 'Manitoba'",ORG,None,,,pork company/council,,,
456455
2,test,75,"[207, 215): 'Santa Fe'",ORG,None,,,company name,,,
457456
2,test,75,"[455, 463): 'Santa Fe'",ORG,None,,,company name,,,
@@ -471,7 +470,7 @@ num_models,fold,doc_offset,corpus_span,corpus_ent_type,error_type,correct_span,c
471470
2,test,101,"[89, 102): 'East Timorese'",MISC,None,,,adj describing birth loc of person,,,
472471
2,test,104,"[2622, 2628): 'Yangon'",LOC,None,,,university,,,
473472
2,test,108,"[2380, 2391): 'pro-Israeli'",MISC,None,,,adj,,,
474-
2,test,114,"[11, 17): 'Iowa-S'",LOC,Token,"[11, 15): 'Iowa' ",,split on ‘-’,,,
473+
2,test,114,"[11, 17): 'Iowa-S'",LOC,Token,"[11, 15): 'Iowa'",LOC,split on ‘-’,,,
475474
2,test,117,"[1654, 1661): 'Permian'",MISC,None,,,"ambiguous could be a company or region, ref to natgas",,,
476475
2,test,118,"[776, 791): 'mid-Mississippi'",MISC,None,,,adj,,,
477476
2,test,119,"[11, 15): 'CBOT'",ORG,None,,,chicago board of trade,,,
@@ -11627,4 +11626,4 @@ num_models,fold,doc_offset,corpus_span,corpus_ent_type,error_type,correct_span,c
1162711626
16,test,230,"[1108, 1115): 'Germany'",LOC,,,,,,,
1162811627
16,test,230,"[1127, 1132): 'Irish'",MISC,,,,,,,
1162911628
16,test,230,"[1153, 1160): 'England'",LOC,,,,,,,
11630-
16,test,230,"[1252, 1259): 'England'",LOC,,,,,,,
11629+
16,test,230,"[1252, 1259): 'England'",LOC,,,,,,,

0 commit comments

Comments
 (0)