@@ -153,7 +153,7 @@ def correct_span(self, corpus_span, correct_span, doc_num):
153153 if tag != 'O' : # We only want the part after I/B-
154154 _ , tag = self .dataset_lines [corpus_begin_linum ].rsplit (sep = '-' , maxsplit = 1 )
155155 else :
156- print (f"{ corpus_span } has an invalid tag { tag } " )
156+ print (f"{ corpus_span } has an invalid tag { tag } " , file = sys . stderr )
157157
158158 # correct using the correct span
159159 begin_linum , end_linum = self .find (correct_span , doc_num )
@@ -164,10 +164,28 @@ def correct_span(self, corpus_span, correct_span, doc_num):
164164 for linum in range (begin_linum , end_linum + 1 ):
165165 line = self .dataset_lines [linum ]
166166 prefix , _ = line .rsplit (maxsplit = 1 )
167- correct_line = ' ' .join ((prefix , f'I-{ tag } ' ))
167+
168+ # Determine type
169+ if linum == begin_linum and linum != 0 :
170+ type_ = self ._determine_type (self .dataset_lines [linum - 1 ], line )
171+ else :
172+ type_ = "I"
173+
174+ correct_line = ' ' .join ((prefix , f'{ type_ } -{ tag } ' ))
168175 self .dataset_lines [linum ] = correct_line
169176 # TODO: May need to examine and correct "I-" to "B-" following this line
170177
178+ def _determine_type (self , prev_line , current_line_tag ):
179+ 'Determine whether the current line should be "I-" or "B-".'
180+
181+ type_and_tag = prev_line .rsplit (maxsplit = 1 )
182+ if (len (type_and_tag ) == 2 and type_and_tag [1 ].startswith (("I-" , "B-" )) and
183+ type_and_tag [1 ].endswith (f"-{ current_line_tag } " )):
184+ # previous line is I or B type and has the same tag
185+ return "B"
186+
187+ return "I"
188+
171189 def save (self ):
172190 "Return the corrected dataset file."
173191
0 commit comments