Skip to content

Commit 5652018

Browse files
authored
Merge pull request #35 from CODAIT/process-missing-later
Correct missing errors after span errors
2 parents cd0e001 + 3c9591a commit 5652018

1 file changed

Lines changed: 12 additions & 7 deletions

File tree

scripts/download_and_correct_corpus.py

Lines changed: 12 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -237,6 +237,7 @@ def process_label_file(dataset_fold, dataset_file, csv_patch_file, csv_encoding=
237237
csv_patch = csv_patch[csv_patch['error_type'] != 'None']
238238

239239
dataset = Dataset(dataset_file)
240+
240241
for index, row in csv_patch.iterrows():
241242
# A lot of rows misplace correct_span and corpus_span. If corpus_span
242243
# is empty, use correct_span as the corpus_span for Missing, Tag, and Wrong.
@@ -247,13 +248,6 @@ def process_label_file(dataset_fold, dataset_file, csv_patch_file, csv_encoding=
247248
file=sys.stderr)
248249
continue
249250

250-
if row['error_type'] == 'Missing':
251-
if isinstance(row['correct_ent_type'], float) and math.isnan(row['correct_ent_type']):
252-
print(f'[WARNING] correct ent type for line {index} are empty. row: {row}. Skipping...',
253-
file=sys.stderr)
254-
continue
255-
dataset.correct_missing(corpus_span, row['correct_ent_type'], int(row['doc_offset']))
256-
continue
257251
elif row['error_type'] == 'Tag':
258252
dataset.correct_tag(corpus_span, row['correct_ent_type'], int(row['doc_offset']))
259253
elif row['error_type'] == 'Wrong':
@@ -272,6 +266,17 @@ def process_label_file(dataset_fold, dataset_file, csv_patch_file, csv_encoding=
272266
dataset.correct_tag(row['corpus_span'], row['correct_ent_type'], int(row['doc_offset']))
273267
dataset.correct_span(row['corpus_span'], row['correct_span'], int(row['doc_offset']))
274268

269+
for index, row in csv_patch.iterrows():
270+
if row['error_type'] == 'Missing':
271+
if isinstance(row['correct_span'], float) and math.isnan(row['correct_span']):
272+
print(f'[WARNING] Correct span for line {index} is empty. Skipping...', file=sys.stderr)
273+
continue
274+
if isinstance(row['correct_ent_type'], float) and math.isnan(row['correct_ent_type']):
275+
print(f'[WARNING] correct ent type for line {index} are empty. row: {row}. Skipping...',
276+
file=sys.stderr)
277+
continue
278+
dataset.correct_missing(row['correct_span'], row['correct_ent_type'], int(row['doc_offset']))
279+
275280
result = dataset.save()
276281

277282
with open(target_file, mode="w") as f:

0 commit comments

Comments
 (0)