Skip to content

Commit f5a2c94

Browse files
authored
More processing to ensure the proper type (I or B) (#22)
The chance that these types need to be changed is low. Indeed, our results show no change after adding this processing, which confirms that the corpus as corrected before this processing was added was already complete with respect to the I- and B- types. Nevertheless, for completeness, we should have this code in place.
1 parent 1a820ad commit f5a2c94

1 file changed

Lines changed: 41 additions & 5 deletions

File tree

scripts/download_and_correct_corpus.py

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,13 @@ def correct_tag(self, span, right_tag, doc_num):
110110
line = self.dataset_lines[linum]
111111
prefix, tag = line.rsplit(maxsplit=1)
112112
if tag == 'O':
113-
correct_line = ' '.join((prefix, "I-" + right_tag)) # TODO: May not be I-
113+
# Determine type
114+
if linum == begin_linum and linum != 0:
115+
type_ = self._determine_type(self.dataset_lines[linum - 1], right_tag)
116+
else:
117+
type_ = "I"
118+
119+
correct_line = ' '.join((prefix, f"{type_}-" + right_tag))
114120
else:
115121
prefix, _ = line.rsplit(sep='-', maxsplit=1)
116122
correct_line = '-'.join((prefix, right_tag))
@@ -128,6 +134,11 @@ def _correct_wrong_range(self, begin_linum, end_linum):
128134
# TODO: Need to examine the correctness of nearby "B-" and "I-"
129135
self.dataset_lines[linum] = correct_line
130136

137+
# If the line below the corrected range is "B-", it should be changed to "I-"
138+
if end_linum < len(self.dataset_lines) - 1: # end_linum is not the last line
139+
self.dataset_lines[end_linum + 1] = self._correct_line_i_b(self.dataset_lines[end_linum],
140+
self.dataset_lines[end_linum + 1])
141+
131142
def correct_wrong(self, span, doc_num):
132143
"Correct a Wrong type error."
133144

@@ -139,9 +150,15 @@ def correct_missing(self, span, right_tag, doc_num):
139150

140151
begin_linum, end_linum = self.find(span, doc_num)
141152
for linum in range(begin_linum, end_linum + 1):
153+
# Determine type
154+
if linum == begin_linum and linum != 0:
155+
type_ = self._determine_type(self.dataset_lines[linum - 1], right_tag)
156+
else:
157+
type_ = "I"
158+
142159
line = self.dataset_lines[linum]
143160
prefix, _ = line.rsplit(maxsplit=1)
144-
correct_line = ' '.join((prefix, f'I-{right_tag}')) # TODO: This is not necessarily "I-"
161+
correct_line = ' '.join((prefix, f'{type_}-{right_tag}'))
145162
self.dataset_lines[linum] = correct_line
146163

147164
def correct_span(self, corpus_span, correct_span, doc_num):
@@ -167,16 +184,22 @@ def correct_span(self, corpus_span, correct_span, doc_num):
167184

168185
# Determine type
169186
if linum == begin_linum and linum != 0:
170-
type_ = self._determine_type(self.dataset_lines[linum - 1], line)
187+
type_ = self._determine_type(self.dataset_lines[linum - 1], tag)
171188
else:
172189
type_ = "I"
173190

174191
correct_line = ' '.join((prefix, f'{type_}-{tag}'))
175192
self.dataset_lines[linum] = correct_line
176-
# TODO: May need to correct examine "I-" to "B-" following this line
193+
194+
# Next line type may need correction
195+
if end_linum < len(self.dataset_lines) - 1: # not last line
196+
self.dataset_lines[end_linum + 1] = self._correct_line_i_b(self.dataset_lines[end_linum],
197+
self.dataset_lines[end_linum + 1])
177198

178199
def _determine_type(self, prev_line, current_line_tag):
179-
'Determine whether the current line should be "I-" or "B-".'
200+
'''Determine whether the current line should be "I-" or "B-". ``current_line_tag`` is the part after "I-" or
201+
"B-", or "O".
202+
'''
180203

181204
type_and_tag = prev_line.rsplit(maxsplit=1)
182205
if (len(type_and_tag) == 2 and type_and_tag[1].startswith(("I-", "B-")) and
@@ -186,6 +209,19 @@ def _determine_type(self, prev_line, current_line_tag):
186209

187210
return "I"
188211

212+
def _correct_line_i_b(self, prev_line, current_line):
213+
"Correct the I- and B- type of the current line."
214+
215+
prefix_and_tag = current_line.rsplit(maxsplit=1)
216+
if len(prefix_and_tag) <= 1: # blank line
217+
return current_line
218+
tag = prefix_and_tag[1]
219+
if tag == 'O': # no I- or B- distinction
220+
return current_line
221+
tag = tag.rsplit('-', maxsplit=1)[1]
222+
type_ = self._determine_type(prev_line, tag)
223+
return ' '.join((prefix_and_tag[0], f'{type_}-{tag}'))
224+
189225
def save(self):
190226
"Return the corrected dataset file."
191227

0 commit comments

Comments
 (0)