Skip to content

Commit e740f09

Browse files
committed
added token corrections to correct corpus.py
1 parent 579fb17 commit e740f09

2 files changed

Lines changed: 33 additions & 16 deletions

File tree

scripts/Hand_Token_relabeler.ipynb

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -121,17 +121,6 @@
121121
"corrections.head(curr_row).tail()"
122122
]
123123
},
124-
{
125-
"cell_type": "code",
126-
"execution_count": null,
127-
"metadata": {},
128-
"outputs": [],
129-
"source": [
130-
"with open(\"../corrected_labels/token_corrections.csv\",'w') as file:\n",
131-
" corrections.to_csv(file)\n",
132-
"print(\"printed to file\")"
133-
]
134-
},
135124
{
136125
"cell_type": "code",
137126
"execution_count": null,
@@ -143,7 +132,7 @@
143132
"# also remove non-filled out lines. \n",
144133
"corrections_preened = corrections.drop_duplicates([\"fold\",\"line_no\", \"doc_offset\"], ignore_index=True); \n",
145134
"corrections_preened = corrections_preened[corrections_preened[\"correct_line\"] != '']\n",
146-
"corrections_preened"
135+
"corrections_preened.set_index(\"line_no\", inplace = True)"
147136
]
148137
},
149138
{
@@ -152,8 +141,8 @@
152141
"metadata": {},
153142
"outputs": [],
154143
"source": [
155-
"with open(\"../corrected_labels/token_corrections.csv\",'w') as file:\n",
156-
" corrections_preened.to_csv(file)\n",
144+
"with open(\"../corrected_labels/token_corrections.json\",'w') as file:\n",
145+
" corrections_preened.to_json(file)\n",
157146
"print(\"printed to file\")"
158147
]
159148
},

scripts/download_and_correct_corpus.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -306,8 +306,27 @@ def process_sentence_file(dataset_fold, dataset_file, json_file, target_file):
306306
for l in file_lines:
307307
new_file.write(l)
308308

309+
def process_token_file(dataset_fold, dataset_file, sentence_json_file, token_edits_json_file, target_file):
310+
with open(token_edits_json_file) as f:
311+
edits = pd.read_json(f);
312+
edits = edits[edits.fold == dataset_fold]; # select only correct fold
313+
with open(sentence_json_file) as f:
314+
sentence_deletes = json.load(f);
315+
with open(dataset_file, "r") as source_file:
316+
file_lines = source_file.readlines()
317+
318+
removed = 0
319+
for l in range(0, edits.index.max()):
320+
if l in sentence_deletes[dataset_fold]:
321+
removed +=1
322+
if l in edits.index :
323+
file_lines[l-removed] = edits.at[l,'correct_line']
324+
with open(target_file, "w+") as new_file:
325+
for l in file_lines:
326+
new_file.write(l)
327+
309328

310-
def apply_corrections(data_set_info, label_csv_file, sentence_json_file, target_dir=None, corpus_fold=None):
329+
def apply_corrections(data_set_info, label_csv_file, sentence_json_file,token_edits_json_file, target_dir=None, corpus_fold=None):
311330
"""
312331
Applies label and sentence boundary corrections
313332
:param data_set_info: Dictionary containing a mapping from fold name to file name for
@@ -333,7 +352,10 @@ def apply_corrections(data_set_info, label_csv_file, sentence_json_file, target_
333352
process_label_file(fold, fold_file, label_csv_file, None, temp_file.name)
334353

335354
logging.info("Correcting sentence boundaries for fold '{}'".format(fold))
336-
process_sentence_file(fold, temp_file.name, sentence_json_file, target_file)
355+
process_sentence_file(fold, temp_file.name, sentence_json_file, temp_file.name)
356+
357+
logging.info("Correcting token errors for fold'{}'".format(fold))
358+
process_token_file(fold,temp_file.name,sentence_json_file, token_edits_json_file,target_file)
337359

338360
logging.info("Corrected corpus fold '{}' to file: '{}'".format(fold, target_file))
339361

@@ -358,6 +380,10 @@ def apply_corrections(data_set_info, label_csv_file, sentence_json_file, target_
358380
default=os.path.join("corrected_labels",
359381
"sentence_corrections.json"))
360382

383+
parser.add_argument("--token_corrections_file", type=str,
384+
default=os.path.join("corrected_labels",
385+
"token_corrections.json"))
386+
361387
parser.add_argument("--corpus_fold", type=str,
362388
help="Correct only a specific fold of the corpus if specified as "
363389
"[train|dev|test], otherwise with correct the entire corpus")
@@ -375,6 +401,7 @@ def apply_corrections(data_set_info, label_csv_file, sentence_json_file, target_
375401
"original_corpus_dir": args.original_corpus_dir,
376402
"corrected_corpus_dir": args.original_corpus_dir,
377403
"label_corrections_file": args.label_corrections_file,
404+
"token_corrections_file": args.token_corrections_file,
378405
"sentence_boundary_corrections_file": args.sentence_boundary_corrections_file,
379406
}
380407

@@ -387,6 +414,7 @@ def apply_corrections(data_set_info, label_csv_file, sentence_json_file, target_
387414
get_or_download_corpus(target_dir=args.original_corpus_dir),
388415
args.label_corrections_file,
389416
args.sentence_boundary_corrections_file,
417+
args.token_corrections_file,
390418
args.corrected_corpus_dir,
391419
args.corpus_fold
392420
)

0 commit comments

Comments (0)