 import os
 import copy
+import re
 from parser import Parser
 import json
 from stanfordcorenlp import StanfordCoreNLP
 
 def get_data_paths(ace2005_path):
     test_files, dev_files, train_files = [], [], []
-
     with open('./data_list.csv', mode='r') as csv_file:
         rows = csv_file.readlines()
         for row in rows[1:]:
@@ -28,13 +28,30 @@ def get_data_paths(ace2005_path):
 
 
 def find_token_index(tokens, start_pos, end_pos, phrase):
-    start_idx = -1
+    start_idx, end_idx = -1, -1
     for idx, token in enumerate(tokens):
         if token['characterOffsetBegin'] <= start_pos:
             start_idx = idx
 
-    # Some of the ACE2005 data has annotation position errors.
-    end_idx = start_idx + len(phrase.split())
+    assert start_idx != -1, "start_idx: {}, start_pos: {}, phrase: {}, tokens: {}".format(start_idx, start_pos, phrase, tokens)
+    chars = ''
+
+    def remove_punc(s):
+        s = re.sub(r'[^\w]', '', s)
+        return s
+
+    for i in range(0, len(tokens) - start_idx):
+        chars += remove_punc(tokens[start_idx + i]['originalText'])
+        if remove_punc(phrase) in chars:
+            end_idx = start_idx + i + 1
+            break
+
+    assert end_idx != -1, "end_idx: {}, end_pos: {}, phrase: {}, tokens: {}, chars:{}".format(end_idx, end_pos, phrase, tokens, chars)
+    return start_idx, end_idx
+
+
+def find_token_index_v2(words, phrase):
+    start_idx, end_idx = -1, -1
 
     return start_idx, end_idx
 
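A quick sanity check of the new matching logic above, with toy tokens in CoreNLP's shape (values invented for illustration). Because remove_punc strips everything that is not a word character, small punctuation and offset mismatches in the ACE 2005 annotations no longer break the end-index search:

    tokens = [
        {'characterOffsetBegin': 0,  'originalText': 'U.S.'},
        {'characterOffsetBegin': 5,  'originalText': 'forces'},
        {'characterOffsetBegin': 12, 'originalText': ','},
        {'characterOffsetBegin': 14, 'originalText': 'however'},
    ]
    # remove_punc('U.S. forces') == 'USforces'; accumulating the
    # punctuation-stripped tokens gives 'US', then 'USforces', so the
    # phrase is matched after the second token.
    find_token_index(tokens, start_pos=0, end_pos=11, phrase='U.S. forces')  # -> (0, 2)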
@@ -59,10 +76,11 @@ def preprocessing(data_type, files):
             data['golden-event-mentions'] = []
 
             try:
-                nlp_text = nlp.annotate(item['sentence'], properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'})
-                nlp_res = json.loads(nlp_text)
+                nlp_res_raw = nlp.annotate(item['sentence'], properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'})
+                nlp_res = json.loads(nlp_res_raw)
             except Exception as e:
-                print('[Warning] StanfordCore Exception: ', nlp_text, 'This sentence will be ignored.')
+                print('[Warning] StanfordCore Exception: ', nlp_res_raw, 'This sentence will be ignored.')
+                print('If you want to include all sentences, please refer to this issue: https://github.com/nlpcl-lab/ace2005-preprocessing/issues/1')
                 continue
 
             tokens = nlp_res['sentences'][0]['tokens']
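For context, nlp_res is the parsed CoreNLP JSON response; with the 'tokenize,ssplit,pos,lemma,parse' annotators it looks roughly like the sketch below (abridged and illustrative, exact fields depend on the CoreNLP version). The 'characterOffsetBegin' and 'originalText' fields are what find_token_index relies on:

    nlp_res = {
        'sentences': [{
            'index': 0,
            'parse': '(ROOT (S ...))',
            'tokens': [
                {'index': 1, 'word': 'Troops', 'originalText': 'Troops',
                 'lemma': 'troop', 'pos': 'NNS',
                 'characterOffsetBegin': 0, 'characterOffsetEnd': 6},
                # ...
            ],
        }],
    }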
@@ -91,6 +109,10 @@ def preprocessing(data_type, files):
                     end_pos=position[1] - sent_start_pos + 1,
                     phrase=entity_mention['text'],
                 )
+                # start_idx, end_idx = find_token_index_v2(
+                #     words=data['words'],
+                #     phrase=entity_mention['text'],
+                # )
 
                 entity_mention['start'] = start_idx
                 entity_mention['end'] = end_idx
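The start_pos/end_pos arguments convert the ACE annotation's document-level character span into sentence-relative offsets: sent_start_pos, computed earlier in this function (outside this diff), appears to be the sentence's starting character offset in the document. A small worked example of that arithmetic, with invented numbers:

    sent_start_pos = 100      # sentence starts at document offset 100
    position = [112, 120]     # entity span in document-level offsets
    start_pos = position[0] - sent_start_pos        # 12
    end_pos = position[1] - sent_start_pos + 1      # 21
    # These sentence-relative offsets are then compared against the
    # CoreNLP tokens' characterOffsetBegin values in find_token_index.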
@@ -109,6 +131,10 @@ def preprocessing(data_type, files):
                     end_pos=position[1] - sent_start_pos + 1,
                     phrase=event_mention['trigger']['text'],
                 )
+                # start_idx, end_idx = find_token_index_v2(
+                #     words=data['words'],
+                #     phrase=event_mention['trigger']['text'],
+                # )
 
                 event_mention['trigger']['start'] = start_idx
                 event_mention['trigger']['end'] = end_idx
@@ -125,6 +151,10 @@ def preprocessing(data_type, files):
                         end_pos=position[1] - sent_start_pos + 1,
                         phrase=argument['text'],
                     )
+                    # start_idx, end_idx = find_token_index_v2(
+                    #     words=data['words'],
+                    #     phrase=argument['text'],
+                    # )
                     argument['start'] = start_idx
                     argument['end'] = end_idx
                     del argument['position']
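find_token_index_v2 is still a stub in this commit, but the commented-out call sites above suggest it will look the phrase up in data['words'] rather than use character offsets. One possible word-based sketch of that idea (an assumption, not the repository's eventual implementation):

    def find_token_index_v2_sketch(words, phrase):
        # Hypothetical: return the [start, end) word span of the first
        # occurrence of the phrase's words inside the sentence's word list.
        start_idx, end_idx = -1, -1
        phrase_words = phrase.split()
        for i in range(len(words) - len(phrase_words) + 1):
            if words[i:i + len(phrase_words)] == phrase_words:
                start_idx, end_idx = i, i + len(phrase_words)
                break
        return start_idx, end_idx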
@@ -155,6 +185,6 @@ def preprocessing(data_type, files):
     with StanfordCoreNLP('./stanford-corenlp-full-2018-10-05', memory='8g', timeout=60000) as nlp:
         # res = nlp.annotate('Donald John Trump is current president of the United States.', properties={'annotators': 'tokenize,ssplit,pos,lemma,parse'})
         # print(res)
-        preprocessing('train', train_files)
-        preprocessing('test', test_files)
         preprocessing('dev', dev_files)
+        preprocessing('test', test_files)
+        preprocessing('train', train_files)