@@ -110,7 +110,13 @@ def correct_tag(self, span, right_tag, doc_num):
110110 line = self .dataset_lines [linum ]
111111 prefix , tag = line .rsplit (maxsplit = 1 )
112112 if tag == 'O' :
113- correct_line = ' ' .join ((prefix , "I-" + right_tag )) # TODO: May not be I-
113+ # Determine type
114+ if linum == begin_linum and linum != 0 :
115+ type_ = self ._determine_type (self .dataset_lines [linum - 1 ], right_tag )
116+ else :
117+ type_ = "I"
118+
119+ correct_line = ' ' .join ((prefix , f"{ type_ } -" + right_tag ))
114120 else :
115121 prefix , _ = line .rsplit (sep = '-' , maxsplit = 1 )
116122 correct_line = '-' .join ((prefix , right_tag ))
@@ -128,6 +134,11 @@ def _correct_wrong_range(self, begin_linum, end_linum):
128134 # TODO: Need to examine the correctness of nearby "B-" and "I-"
129135 self .dataset_lines [linum ] = correct_line
130136
137+ # If the line below the corrected range is "B-", it should be changed to "I-"
138+ if end_linum < len (self .dataset_lines ) - 1 : # end_linum is not the last line
139+ self .dataset_lines [end_linum + 1 ] = self ._correct_line_i_b (self .dataset_lines [end_linum ],
140+ self .dataset_lines [end_linum + 1 ])
141+
131142 def correct_wrong (self , span , doc_num ):
132143 "Correct a Wrong type error."
133144
@@ -139,9 +150,15 @@ def correct_missing(self, span, right_tag, doc_num):
139150
140151 begin_linum , end_linum = self .find (span , doc_num )
141152 for linum in range (begin_linum , end_linum + 1 ):
153+ # Determine type
154+ if linum == begin_linum and linum != 0 :
155+ type_ = self ._determine_type (self .dataset_lines [linum - 1 ], right_tag )
156+ else :
157+ type_ = "I"
158+
142159 line = self .dataset_lines [linum ]
143160 prefix , _ = line .rsplit (maxsplit = 1 )
144- correct_line = ' ' .join ((prefix , f'I -{ right_tag } ' )) # TODO: This is not necessarily "I-"
161+ correct_line = ' ' .join ((prefix , f'{ type_ } -{ right_tag } ' ))
145162 self .dataset_lines [linum ] = correct_line
146163
147164 def correct_span (self , corpus_span , correct_span , doc_num ):
@@ -167,16 +184,22 @@ def correct_span(self, corpus_span, correct_span, doc_num):
167184
168185 # Determine type
169186 if linum == begin_linum and linum != 0 :
170- type_ = self ._determine_type (self .dataset_lines [linum - 1 ], line )
187+ type_ = self ._determine_type (self .dataset_lines [linum - 1 ], tag )
171188 else :
172189 type_ = "I"
173190
174191 correct_line = ' ' .join ((prefix , f'{ type_ } -{ tag } ' ))
175192 self .dataset_lines [linum ] = correct_line
176- # TODO: May need to correct examine "I-" to "B-" following this line
193+
194+ # Next line type may need correction
195+ if end_linum < len (self .dataset_lines ) - 1 : # not last line
196+ self .dataset_lines [end_linum + 1 ] = self ._correct_line_i_b (self .dataset_lines [end_linum ],
197+ self .dataset_lines [end_linum + 1 ])
177198
178199 def _determine_type (self , prev_line , current_line_tag ):
179- 'Determine whether the current line should be "I-" or "B-".'
200+ '''Determine whether the current line should be "I-" or "B-". ``current_line_tag`` is the part after "I-" or
201+ "B-", or "O".
202+ '''
180203
181204 type_and_tag = prev_line .rsplit (maxsplit = 1 )
182205 if (len (type_and_tag ) == 2 and type_and_tag [1 ].startswith (("I-" , "B-" )) and
@@ -186,6 +209,19 @@ def _determine_type(self, prev_line, current_line_tag):
186209
187210 return "I"
188211
212+ def _correct_line_i_b (self , prev_line , current_line ):
213+ "Correct the I- and B- type of the current line."
214+
215+ prefix_and_tag = current_line .rsplit (maxsplit = 1 )
216+ if len (prefix_and_tag ) <= 1 : # blank line
217+ return current_line
218+ tag = prefix_and_tag [1 ]
219+ if tag == 'O' : # no I- or B- distinction
220+ return current_line
221+ tag = tag .rsplit ('-' , maxsplit = 1 )[1 ]
222+ type_ = self ._determine_type (prev_line , tag )
223+ return ' ' .join ((prefix_and_tag [0 ], f'{ type_ } -{ tag } ' ))
224+
189225 def save (self ):
190226 "Return the corrected dataset file."
191227
0 commit comments