Skip to content

Commit f75c041

Browse files
authored
For span corrections, for the first token, type should be "B-" if the previous token is "B-" or "I-" (#20)
1 parent b334eac commit f75c041

1 file changed

Lines changed: 20 additions & 2 deletions

File tree

scripts/download_and_correct_corpus.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ def correct_span(self, corpus_span, correct_span, doc_num):
153153
if tag != 'O': # We only want the part after I/B-
154154
_, tag = self.dataset_lines[corpus_begin_linum].rsplit(sep='-', maxsplit=1)
155155
else:
156-
print(f"{corpus_span} has an invalid tag {tag}")
156+
print(f"{corpus_span} has an invalid tag {tag}", file=sys.stderr)
157157

158158
# correct using the correct span
159159
begin_linum, end_linum = self.find(correct_span, doc_num)
@@ -164,10 +164,28 @@ def correct_span(self, corpus_span, correct_span, doc_num):
164164
for linum in range(begin_linum, end_linum + 1):
165165
line = self.dataset_lines[linum]
166166
prefix, _ = line.rsplit(maxsplit=1)
167-
correct_line = ' '.join((prefix, f'I-{tag}'))
167+
168+
# Determine type
169+
if linum == begin_linum and linum != 0:
170+
type_ = self._determine_type(self.dataset_lines[linum - 1], line)
171+
else:
172+
type_ = "I"
173+
174+
correct_line = ' '.join((prefix, f'{type_}-{tag}'))
168175
self.dataset_lines[linum] = correct_line
169176
# TODO: May need to examine the line following this one and correct its "I-" to "B-"
170177

178+
def _determine_type(self, prev_line, current_line_tag):
    """Return the label prefix ("B" or "I") for the current line.

    The previous line is split once on whitespace from the right; its
    last field is treated as that line's label.

    Args:
        prev_line: Text of the line immediately before the current one.
        current_line_tag: Tag to compare against, without an "I-"/"B-"
            prefix (it is matched as the "-<tag>" suffix of the
            previous label).

    Returns:
        "B" when the previous line has at least two whitespace-separated
        fields and its label both starts with "I-" or "B-" and ends with
        "-<current_line_tag>"; "I" in every other case.
    """
    fields = prev_line.rsplit(maxsplit=1)

    # Blank or single-field lines carry no separate label to compare with.
    if len(fields) != 2:
        return "I"

    prev_label = fields[1]

    # The previous token is not part of any tagged span.
    if not prev_label.startswith(("I-", "B-")):
        return "I"

    # Same tag right before us: mark the start of a new span with "B".
    if prev_label.endswith(f"-{current_line_tag}"):
        return "B"

    return "I"
188+
171189
def save(self):
172190
"Return the corrected dataset file."
173191

0 commit comments

Comments
 (0)