Skip to content

Commit 664e8f2

Browse files
committed
Partially cleaned up the code of Arabic FixEdeprels.
1 parent 183da4e commit 664e8f2

File tree

1 file changed

+106
-52
lines changed

1 file changed

+106
-52
lines changed

udapi/block/ud/ar/fixedeprels.py

Lines changed: 106 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -136,6 +136,7 @@ class FixEdeprels(Block):
136136
'بِ_صَدَد': 'بِصَدَدِ:gen', # biṣadadi = with respect to
137137
'بِ_صَرف_نَظَر_عَن': 'بِصَرفِ_اَلنَّظَرِ_عَن:gen', # biṣarfi an-naẓari ʿan = regardless of
138138
'بِ_صِفَة': 'بِصِفَةِ:gen', # biṣifati = as
139+
'بِ_صُورَة': 'بِ:gen',
139140
'بِ_عَكس': 'بِ:gen',
140141
'بِ_عَلَى': 'بِ:gen',
141142
'بِ_عَن': 'بِ:gen',
@@ -196,6 +197,7 @@ class FixEdeprels(Block):
196197
'بَينَ': 'بَينَ:gen', # bayna = between
197198
'بَينَ_حَوَالَى': 'بَينَ:gen', # bayna hawala
198199
'بينا': 'بَينَ:gen', # bayna = between
200+
'بَينَ_وَ_وَ_وَ': 'بَينَ:gen', # bayna = between
199201
'بَينَمَا': 'بَينَ:gen',
200202
'بَينَمَا_لَم': 'بَينَ:gen',
201203
'تُجَاهَ': 'تُجَاهَ:gen', # tuǧāha = towards, facing
@@ -222,8 +224,10 @@ class FixEdeprels(Block):
222224
'خَارِجَ': 'خَارِجَ:gen', # ḫāriǧa = outside
223225
'خِلَالَ': 'خِلَالَ:gen', # ḫilāla = during
224226
'خَلفَ': 'خَلفَ:gen', # ḫalfa = behind
225-
'دَاخِل': 'دَاخِلَ:gen', # dāḫila = inside of
226-
'دَاخِلَ': 'دَاخِلَ:gen', # dāḫila = inside of
227+
'دَاخِل':
228+
'دَاخِلَ:gen', # dāḫila = inside of
229+
'دَاخِلَ':
230+
'دَاخِلَ:gen', # dāḫila = inside of
227231
'دُونَ': 'دُونَ:gen', # dūna = without
228232
'دُونَ_أَن': 'دُونَ:gen', # dūna ʾan = without
229233
'دُونَ_سِوَى': 'دُونَ:gen', # dūna siwā = without
@@ -245,9 +249,12 @@ class FixEdeprels(Block):
245249
'سِوَى_لِ': 'سِوَى:gen', # siwā = except for
246250
'ضِدَّ': 'ضِدَّ:gen', # ḍidda = against
247251
'ضِمنَ': 'ضِمنَ:gen', # ḍimna = within, inside, among
248-
'طَالَمَا': 'طَالَمَا', # ṭālamā = as long as
249-
'طالَما': 'طَالَمَا:gen',
250-
'طَالَمَا_أَنَّ': 'طَالَمَا', # ṭālamā = as long as
252+
'طَالَمَا':
253+
'طَالَمَا', # ṭālamā = as long as
254+
'طالَما':
255+
'طَالَمَا', # ṭālamā = as long as
256+
'طَالَمَا_أَنَّ':
257+
'طَالَمَا', # ṭālamā = as long as
251258
'طِوَالَ': 'طِوَالَ:gen', # ṭiwāla = throughout
252259
'طِيلَةَ': 'طِيلَةَ:gen', # ṭīlata = during
253260
'عبر': 'عَبرَ:gen',
@@ -265,18 +272,32 @@ class FixEdeprels(Block):
265272
'عَلَى_أَسَاس_أَنَّ': 'عَلَى_أَسَاسٍ:gen', # ʿalā ʾasāsin = based on
266273
'عَلَى_اِعتِبَار_أَنَّ': 'عَلَى_اِعتِبَارِ_أَنَّ', # ʿalā iʿtibāri ʾanna = considering that
267274
'عَلَى_إِلَّا': 'إِلَّا', # ʾillā = except, unless
268-
'عَلَى_الفور': 'عَلَى:gen',
269-
'عَلَى_إِلَى': 'عَلَى:gen',
270-
'عَلَى_أَن': 'عَلَى:gen', # ʿalā = on
271-
'عَلَى_أَنَّ': 'عَلَى:gen', # ʿalā = on
272-
'عَلَى_أَن_بِ': 'عَلَى:gen', # ʿalā = on
273-
'عَلَى_أَنَّ_مِن_شَأن': 'عَلَى:gen', # ʿalā = on
274-
'عَلَى_أَنَّ_هُوَ': 'عَلَى:gen', # ʿalā = on
275-
'عَلَى_أَنَّ_هُوَ_لَدَى': 'عَلَى:gen', # ʿalā = on
276-
'عَلَى_بِ': 'عَلَى:gen',
277-
'عَلَى_بِ_فِي': 'عَلَى:gen',
278-
'عَلَى_بَينَ': 'عَلَى:gen',
279-
'عَلَى_حَدّ': 'عَلَى:gen',
275+
'عَلَى_الفور':
276+
'عَلَى:gen', # ʿalā = on
277+
'عَلَى_إِلَى':
278+
'عَلَى:gen', # ʿalā = on
279+
'عَلَى_أَن':
280+
'عَلَى:gen', # ʿalā = on
281+
'عَلَى_أَنَّ':
282+
'عَلَى:gen', # ʿalā = on
283+
'عَلَى_أَن_بِ':
284+
'عَلَى:gen', # ʿalā = on
285+
'عَلَى_أَنَّ_عَلَى':
286+
'عَلَى:gen', # ʿalā = on
287+
'عَلَى_أَنَّ_مِن_شَأن':
288+
'عَلَى:gen', # ʿalā = on
289+
'عَلَى_أَنَّ_هُوَ':
290+
'عَلَى:gen', # ʿalā = on
291+
'عَلَى_أَنَّ_هُوَ_لَدَى':
292+
'عَلَى:gen', # ʿalā = on
293+
'عَلَى_بِ':
294+
'عَلَى:gen', # ʿalā = on
295+
'عَلَى_بِ_فِي':
296+
'عَلَى:gen', # ʿalā = on
297+
'عَلَى_بَينَ':
298+
'عَلَى:gen', # ʿalā = on
299+
'عَلَى_حَدّ':
300+
'عَلَى:gen', # ʿalā = on
280301
'عَلَى_حِسَاب': 'عَلَى_حِسَابِ:gen', # ʿalā ḥisābi = at the expense of
281302
'عَلَى_حَسَبَ': 'حَسَبَ:gen', # ḥasaba = according to, depending on
282303
'عَلَى_حَولَ': 'عَلَى:gen',
@@ -526,6 +547,7 @@ class FixEdeprels(Block):
526547
'وَرَاءَ': 'وَرَاءَ:gen', # warāʾa = behind, past, beyond
527548
'وَسطَ': 'وَسطَ:gen', # wasṭa = in the middle
528549
'وِفقَ': 'وِفقَ:gen', # wifqa = according to
550+
'وِفق_لِ': 'وِفقَ:gen', # wifqa = according to
529551
'ولو': 'إِذَا', # walaw = even if
530552
'ولو_أَنَّ': 'إِذَا' # walaw = even if
531553
}
@@ -543,47 +565,79 @@ def copy_case_from_adposition(self, node, adposition):
543565
else:
544566
return None
545567

546-
def process_node(self, node):
568+
@staticmethod
569+
def compose_edeprel(bdeprel, cdeprel):
570+
"""
571+
Composes enhanced deprel from the basic part and optional case
572+
enhancement.
573+
574+
Parameters
575+
----------
576+
bdeprel : str
577+
Basic deprel (can include subtype, e.g., 'acl:relcl').
578+
cdeprel : TYPE
579+
Case enhancement (can be composed of adposition and morphological
580+
case, e.g., 'k:dat'). It is optional and it can be None or empty
581+
string if there is no case enhancement.
582+
583+
Returns
584+
-------
585+
Full enhanced deprel (str).
586+
"""
587+
assert(bdeprel[-1] != ':')
588+
edeprel = bdeprel
589+
if cdeprel:
590+
assert(cdeprel[0] != ':')
591+
edeprel += ':'+cdeprel
592+
return edeprel
593+
594+
def process_tree(self, tree):
    """
    Normalize the case enhancement of enhanced deprels in the whole tree.

    Occasionally the automatically derived edeprels do not match the
    whitelist. For example, the noun is an abbreviation and its
    morphological case is unknown. For every enhanced dependency whose
    label is obl, obl:arg, nmod, advcl, advcl:pred, acl or acl:relcl
    followed by a case enhancement, the enhancement is cleaned up in three
    steps (a leading "wa" is dropped, then the self.outermost table is
    consulted, then the self.unambiguous table) and the label is rewritten
    in place.

    The tree is processed as a whole because process_node() ignores empty
    nodes.

    NOTE(review): the previous docstring spoke of edeprels derived from
    "Czech basic trees"; this block handles Arabic labels, so that wording
    looks like a leftover from the block this one was adapted from --
    confirm before relying on it.
    """
    for node in tree.descendants_and_empty:
        for edep in node.deps:
            label = edep['deprel']
            # One fixed label is repaired directly and needs no further work.
            if label == 'advcl:pred:إِذَن':
                edep['deprel'] = 'advcl:pred'
                continue
            parsed = re.fullmatch(r'(obl(?::arg)?|nmod|advcl(?::pred)?|acl(?::relcl)?):(.+)', label)
            if parsed is None:
                # Not a relation that carries a case enhancement.
                continue
            bdeprel, cdeprel = parsed.groups()
            # Arabic clauses often start with وَ wa "and", which does not add
            # much to the meaning but sometimes gets included in the enhanced
            # case label. Drop it from the front, whether it is followed by
            # further morphs, by a morphological case, or by nothing at all.
            for wa_pattern in (r'^وَ_', r'^وَ:', r'^وَ$'):
                cdeprel = re.sub(wa_pattern, r'', cdeprel)
            edep['deprel'] = self.compose_edeprel(bdeprel, cdeprel)
            # If an "outermost" expression is followed by another
            # preposition or by morphological case, keep only the
            # expression itself, unless the full sequence is listed among
            # the exceptions of that expression.
            for outer in self.outermost:
                rest = re.fullmatch(outer+r'([_:].+)?', cdeprel)
                if rest is None or not rest.group(1):
                    continue
                if outer+rest.group(1) in self.outermost[outer]:
                    continue
                edep['deprel'] = self.compose_edeprel(bdeprel, outer)
                break
            else:
                # No outermost expression applied. Split the preposition
                # from the morphological case (if any) and, when the
                # preposition is known, replace the whole enhancement with
                # its normalized form and fixed morphological case.
                cased = re.fullmatch(r'([^:]+):(nom|gen|acc)', cdeprel)
                adposition = cdeprel if cased is None else cased.group(1)
                if adposition in self.unambiguous:
                    edep['deprel'] = self.compose_edeprel(bdeprel, self.unambiguous[adposition])
587641

588642
def set_basic_and_enhanced(self, node, parent, deprel, edeprel):
589643
'''

0 commit comments

Comments
 (0)