Add verify_result() to check entity/trigger/argument start/end indices against the tokenized words

bowbowbow · bowbowbow · commit db784f18d038 · 2019-09-13T23:35:58.000+09:00
diff --git a/README.md b/README.md
@@ -119,7 +119,7 @@ This project use the same data partitioning as the previous work ([Yang and Mitc
 Below is information about the amount of parsed data when using this project. It is slightly different from the parsing results of the two papers above. The difference seems to have occurred because there are no promised rules for splitting sentences within the sgm format files.
 
 |          | Documents    |  Sentences   |Triggers    | Arguments | Entity Mentions  |
-|-------   |-----------|-----------   |---------------|----------------- |----------------- |
-| Test     | 40        | 713           | 424           | 878             |  4226             |
-| Dev      | 30        | 875           | 505           | 906             |  4050             |
-| Train    | 529       | 14724         | 4420          | 7147             |   53045            |
+|-------   |--------------|--------------|------------|-----------|----------------- |
+| Test     | 40        | 713           | 424           | 892             |  4226             |
+| Dev      | 30        | 875           | 505           | 933             |  4050             |
+| Train    | 529       | 14724         | 4420          | 7811             |   53045            |
diff --git a/main.py b/main.py
@@ -50,10 +50,43 @@ def remove_punc(s):
     return start_idx, end_idx
 
 
-def find_token_index_v2(words, phrase):
-    start_idx, end_idx = -1, -1
+def verify_result(data):
+    def remove_punctuation(s):
+        for c in ['-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-', '-RCB-', '\xa0']:
+            s = s.replace(c, '')
+        s = re.sub(r'[^\w]', '', s)
+        return s
 
-    return start_idx, end_idx
+    def check_diff(words, phrase):
+        return remove_punctuation(phrase) not in remove_punctuation(words)
+
+    for item in data:
+        words = item['words']
+        for entity_mention in item['golden-entity-mentions']:
+            if check_diff(''.join(words[entity_mention['start']:entity_mention['end']]), entity_mention['text'].replace(' ', '')):
+                print('============================')
+                print('[Warning] entity has invalid start/end')
+                print('Expected: ', entity_mention['text'])
+                print('Actual:', words[entity_mention['start']:entity_mention['end']])
+                print('start: {}, end: {}, words: {}'.format(entity_mention['start'], entity_mention['end'], words))
+
+        for event_mention in item['golden-event-mentions']:
+            trigger = event_mention['trigger']
+            if check_diff(''.join(words[trigger['start']:trigger['end']]), trigger['text'].replace(' ', '')):
+                print('============================')
+                print('[Warning] trigger has invalid start/end')
+                print('Expected: ', trigger['text'])
+                print('Actual:', words[trigger['start']:trigger['end']])
+                print('start: {}, end: {}, words: {}'.format(trigger['start'], trigger['end'], words))
+            for argument in event_mention['arguments']:
+                if check_diff(''.join(words[argument['start']:argument['end']]), argument['text'].replace(' ', '')):
+                    print('============================')
+                    print('[Warning] argument has invalid start/end')
+                    print('Expected: ', argument['text'])
+                    print('Actual:', words[argument['start']:argument['end']])
+                    print('start: {}, end: {}, words: {}'.format(argument['start'], argument['end'], words))
+
+    print('Complete verification')
 
 
 def preprocessing(data_type, files):
@@ -109,10 +142,6 @@ def preprocessing(data_type, files):
                     end_pos=position[1] - sent_start_pos + 1,
                     phrase=entity_mention['text'],
                 )
-                # start_idx, end_idx = find_token_index_v2(
-                #     words=data['words'],
-                #     phrase=entity_mention['text'],
-                # )
 
                 entity_mention['start'] = start_idx
                 entity_mention['end'] = end_idx
@@ -131,10 +160,6 @@ def preprocessing(data_type, files):
                     end_pos=position[1] - sent_start_pos + 1,
                     phrase=event_mention['trigger']['text'],
                 )
-                # start_idx, end_idx = find_token_index_v2(
-                #     words=data['words'],
-                #     phrase=event_mention['trigger']['text'],
-                # )
 
                 event_mention['trigger']['start'] = start_idx
                 event_mention['trigger']['end'] = end_idx
@@ -151,10 +176,7 @@ def preprocessing(data_type, files):
                         end_pos=position[1] - sent_start_pos + 1,
                         phrase=argument['text'],
                     )
-                    # start_idx, end_idx = find_token_index_v2(
-                    #     words=data['words'],
-                    #     phrase=argument['text'],
-                    # )
+
                     argument['start'] = start_idx
                     argument['end'] = end_idx
                     del argument['position']
@@ -172,6 +194,7 @@ def preprocessing(data_type, files):
     print('entity :', entity_count)
     print('argument:', argument_count)
 
+    verify_result(result)
     with open('output/{}.json'.format(data_type), 'w') as f:
         json.dump(result, f, indent=2)