Skip to content

Commit cc357e6

Browse files
authored
Merge pull request #30 from pingzhiLi/parser
[FEATURE] Add text format segmentation
2 parents 85621a3 + c078ce3 commit cc357e6

File tree

5 files changed

+23
-4
lines changed

5 files changed

+23
-4
lines changed

AUTHORS.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,7 @@
1212

1313
[Longhu Qin](https://github.com/KenelmQLH)
1414

15+
[Pingzhi Li](https://github.com/pingzhiLi)
1516

16-
The stared contributors are the corresponding authors.
17+
18+
The starred contributors are the corresponding authors.

EduNLP/SIF/parser/parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def _is_formula_legal(self, formula_str):
7676
7777
"""
7878
legal_tags = ['FormFigureID', 'FormFigureBase64', 'FigureID', 'FigureBase64',
79-
'SIFBlank', 'SIFChoice', 'SIFTag', 'SIFSep', 'SIFUnderline']
79+
'SIFBlank', 'SIFChoice', 'SIFTag', 'SIFSep', 'SIFUnderline', 'textf']
8080
for tag in legal_tags:
8181
if tag in formula_str:
8282
return True

EduNLP/SIF/segment/segment.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,11 @@ def __init__(self, item, figures: dict = None):
8484
self._ques_mark_segments = []
8585
self._tag_segments = []
8686
self._sep_segments = []
87-
segments = re.split(r"(\$.+?\$)", item)
87+
88+
# remove $\textf{*} from the item$
89+
item_no_textf = "".join(re.split(r"\$\\textf\{([^,]+?),b?d?i?t?u?w?}\$", item))
90+
91+
segments = re.split(r"(\$.+?\$)", item_no_textf)
8892
for segment in segments:
8993
if not segment:
9094
continue
@@ -294,6 +298,10 @@ def seg(item, figures=None, symbol=None):
294298
>>> s2 = seg(test_item_1_str_2, symbol="fgm")
295299
>>> s2.tag_segments
296300
['\\SIFTag{stem}', '\\SIFTag{options}']
301+
>>> test_item_2 = r"已知$y=x$,则以下说法中$\textf{正确,b}$的是"
302+
>>> s2 = seg(test_item_2)
303+
>>> s2.text_segments
304+
['已知', ',则以下说法中正确的是']
297305
"""
298306
segments = SegmentList(item, figures)
299307
if symbol is not None:

EduNLP/SIF/sif.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,11 @@ def sif4sci(item: str, figures: (dict, bool) = None, safe=True, symbol: str = No
182182
[]
183183
>>> tl3.ques_mark_segments
184184
[['\\SIFChoice']]
185+
>>> test_item_3 = r"已知$y=x$,则以下说法中$\textf{正确,b}$的是"
186+
>>> tl4 = sif4sci(test_item_3)
187+
Warning: there is some chinese characters in formula!
188+
>>> tl4.text_segments
189+
[['已知'], ['说法', '中', '正确']]
185190
"""
186191
try:
187192
if safe is True and is_sif(item) is not True:

tests/test_sif/test_segement.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@ def test_segment(figure0, figure1, figure0_base64, figure1_base64):
1919
r"如图所示,则$\FormFigureBase64{%s}$的面积是$\SIFBlank$。$\FigureBase64{%s}$" % (figure0_base64, figure1_base64),
2020
figures=True
2121
)
22-
2322
with pytest.raises(TypeError):
2423
s.append("123")
24+
seg_test_text = seg(
25+
r"如图所示,有三组$\textf{机器人,bu}$在踢$\textf{足球,b}$",
26+
figures=True
27+
)
28+
assert seg_test_text.text_segments == ['如图所示,有三组机器人在踢足球']

0 commit comments

Comments
 (0)