@@ -50,10 +50,43 @@ def remove_punc(s):
5050 return start_idx , end_idx
5151
5252
53- def find_token_index_v2 (words , phrase ):
54- start_idx , end_idx = - 1 , - 1
# Substrings dropped before comparing texts: bracket placeholder tokens plus a
# non-breaking-space sequence. The bracket tokens contain letters, so the
# non-word regex in _strip_to_word_chars would not remove them on its own.
_IGNORED_TOKENS = ('-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-', '-RCB-', '\xa0 ')


def _strip_to_word_chars(s):
    """Return ``s`` with ignored tokens and every non-word character removed."""
    for token in _IGNORED_TOKENS:
        s = s.replace(token, '')
    return re.sub(r'[^\w]', '', s)


def _report_span_mismatch(kind, words, mention):
    """Warn on stdout when ``mention['text']`` is not found in its word span.

    Args:
        kind: Label used in the warning line ('entity', 'trigger', 'argument').
        words: The word list that ``mention['start']``/``mention['end']`` slice.
        mention: Mapping providing the 'text', 'start' and 'end' keys.

    Returns:
        True if a warning was printed, False otherwise.
    """
    span = words[mention['start']:mention['end']]
    expected = _strip_to_word_chars(mention['text'].replace(' ', ''))
    actual = _strip_to_word_chars(''.join(span))
    # Substring containment, not equality: the span may carry extra characters
    # around the text. A text that normalizes to '' therefore never warns.
    if expected in actual:
        return False

    print('============================')
    print('[Warning] {} has invalid start/end'.format(kind))
    print('Expected: ', mention['text'])
    print('Actual:', span)
    print('start: {}, end: {}, words: {}'.format(mention['start'], mention['end'], words))
    return True


def verify_result(data):
    """Check every annotated span in ``data`` against the words it points to.

    For each item, the words in ``words[start:end]`` are joined and compared
    (after stripping ignored tokens and non-word characters) with the 'text'
    of every entity mention, event trigger and event argument. Each span whose
    text is not contained in its slice is reported on stdout; a final
    'Complete verification' line is always printed.

    Args:
        data: Iterable of mappings with the keys 'words',
            'golden-entity-mentions' and 'golden-event-mentions'. Each event
            mention provides 'trigger' and 'arguments'.

    Returns:
        The number of mismatching spans that were reported (0 if none).
    """
    mismatches = 0
    for item in data:
        words = item['words']

        for entity_mention in item['golden-entity-mentions']:
            if _report_span_mismatch('entity', words, entity_mention):
                mismatches += 1

        for event_mention in item['golden-event-mentions']:
            if _report_span_mismatch('trigger', words, event_mention['trigger']):
                mismatches += 1
            # NOTE(review): arguments are checked for every event, whether or
            # not its trigger matched - confirm against the original nesting.
            for argument in event_mention['arguments']:
                if _report_span_mismatch('argument', words, argument):
                    mismatches += 1

    print('Complete verification')
    return mismatches
5790
5891
5992def preprocessing (data_type , files ):
@@ -109,10 +142,6 @@ def preprocessing(data_type, files):
109142 end_pos = position [1 ] - sent_start_pos + 1 ,
110143 phrase = entity_mention ['text' ],
111144 )
112- # start_idx, end_idx = find_token_index_v2(
113- # words=data['words'],
114- # phrase=entity_mention['text'],
115- # )
116145
117146 entity_mention ['start' ] = start_idx
118147 entity_mention ['end' ] = end_idx
@@ -131,10 +160,6 @@ def preprocessing(data_type, files):
131160 end_pos = position [1 ] - sent_start_pos + 1 ,
132161 phrase = event_mention ['trigger' ]['text' ],
133162 )
134- # start_idx, end_idx = find_token_index_v2(
135- # words=data['words'],
136- # phrase=event_mention['trigger']['text'],
137- # )
138163
139164 event_mention ['trigger' ]['start' ] = start_idx
140165 event_mention ['trigger' ]['end' ] = end_idx
@@ -151,10 +176,7 @@ def preprocessing(data_type, files):
151176 end_pos = position [1 ] - sent_start_pos + 1 ,
152177 phrase = argument ['text' ],
153178 )
154- # start_idx, end_idx = find_token_index_v2(
155- # words=data['words'],
156- # phrase=argument['text'],
157- # )
179+
158180 argument ['start' ] = start_idx
159181 argument ['end' ] = end_idx
160182 del argument ['position' ]
@@ -172,6 +194,7 @@ def preprocessing(data_type, files):
172194 print ('entity :' , entity_count )
173195 print ('argument:' , argument_count )
174196
197+ verify_result (result )
175198 with open ('output/{}.json' .format (data_type ), 'w' ) as f :
176199 json .dump (result , f , indent = 2 )
177200
0 commit comments