Skip to content

Commit 9a22d2b

Browse files
escape comma
1 parent c68de12 commit 9a22d2b

File tree

1 file changed

+6
-2
lines changed

1 file changed

+6
-2
lines changed

docs/ssyn2es.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def load_synonyms(files, output_predicate, discard_punctuation):
3030
continue
3131

3232
entry = line.split(",")[0:9]
33-
headword = unescape(entry[8])
33+
headword = escape_comma(unescape_unicode_literal(entry[8]))
3434

3535
is_deleted = (entry[2] == "2")
3636
is_predicate = (entry[1] == "2")
@@ -55,10 +55,14 @@ def _repl_uncode_literal(m):
5555
return chr(int(m.group(1).strip("{}"), 16))
5656

5757

58-
def unescape(word):
58+
def unescape_unicode_literal(word):
    """Return ``word`` with every ``unicode_literal_pattern`` match replaced.

    Each match is passed to ``_repl_uncode_literal``, which converts the
    hexadecimal code point captured in group 1 into its character.
    """
    substituted = unicode_literal_pattern.sub(_repl_uncode_literal, word)
    return substituted
6060

6161

62+
def escape_comma(word):
    """Return ``word`` with every comma prefixed by a backslash.

    A raw string is used for the replacement because ``"\\,"`` written
    without doubling or the ``r`` prefix is an invalid escape sequence,
    which newer Python versions warn about and will eventually reject.
    """
    return word.replace(",", r"\,")
64+
65+
6266
# List of Unicode General Categories that are treated as punctuation in elasticsearch_sudachi
6367
# see: com.worksap.nlp.lucene.sudachi.ja.util.Strings
6468
punctuation_categories = [

0 commit comments

Comments
 (0)