Issue 72 (#73)

* issue #72: updating readme * issue #72: updating readme * issue #72 bug fixes in finite-state source code, readme updates * issue #72 including Python module for the extraction of noun and adjective bases * issue #72 including Python module for the extraction of noun and adjective bases
LR-POR · Jun 2, 2020 · 3ad22c4 · 3ad22c4
1 parent a92866f
commit 3ad22c4
Show file tree

Hide file tree

Showing 25 changed files with 9,971 additions and 346,679 deletions.
diff --git a/tools/fst/BuildTestTransducers.sh b/tools/fst/BuildTestTransducers.sh
diff --git a/tools/fst/ExtractWordLemmaPairs.py b/tools/fst/ExtractWordLemmaPairs.py
@@ -0,0 +1,207 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Author: Leonel Figueiredo de Alencar
+# [email protected]
+# Date: April 20, 2018, updated February 18, 2020
+
+"""This module is the first component of the architecture of a generator of Portuguese diminutives. It extracts possible diminutive formation bases from existing nouns and adjectives encoded in MorphoBr's format, as described in the following paper:
+
+ALENCAR, Leonel Figueiredo de; CUCONATO, Bruno; RADEMAKER, Alexandre. MorphoBr: an open source large-coverage full-form lexicon for morphological analysis of Portuguese. Texto Livre: Linguagem e Tecnologia, Belo Horizonte, v. 11, n. 3, p. 1-25, set.-dez. 2018.
+ISSN 1983-3652 
+DOI: 10.17851/1983-3652.11.3.1-25
+http://www.periodicos.letras.ufmg.br/index.php/textolivre/article/view/14294. 
+
+
+Implausible bases are filtered out (see details below). The extracted bases are converted to spaced-text format and written to different files, according to the classification expected by the finite-state grammar in the morphotactic-grammar.lexc file. Some examples may help clarify the pipeline:
+
+Input in MorphoBr's format:
+
+agulhão	agulha+N+AUG+M+SG
+agulhões	agulha+N+AUG+M+PL
+agulhona	agulha+N+AUG+F+SG
+agulhonas	agulha+N+AUG+F+PL
+
+
+Output generated by this module (written to different files):
+
+a g u l h a +N +AUG
+a g u l h ã o
+
+a g u l h a +N +AUG
+a g u l h õ e s
+
+a g u l h a +N +AUG
+a g u l h o n a
+
+a g u l h a +N +AUG
+a g u l h o n a s
+
+
+"""
+import os, sys, re
+
+# Entries whose text contains any of these tags are skipped (see exclude_tag).
+EXCLUDE_TAGS=["+DIM","+SUPER"]
+# File name extension appended to every output file opened below.
+EXTENSION=".stxt"
+
+"""Regex pattern matching items that cannot function as bases for
+morphological derivations. This includes one or more consonants before a space
+at the beginning of a line, for example:
+b       b+N+M+SG
+c       c+N+M+SG
+d       d+N+M+SG
+
+These items are in fact abbreviations. As such, they cannot feed diminutive formation,
+e.g. *bzinho 'little b' is ungrammatical (the correct form is 'bezinho', from 'bê', the name of
+letter b).
+The regex pattern also matches abbreviations such as 'ha' (for hectare) and chemical symbols ('Ba', 'Ca', etc.).
+"""
+# Character class of the consonant letters used to build ABB.
+CONS="[bcdfghjklmnpqrstvwxyz]"
+# Case-insensitive pattern: either one or more consonants, or a single
+# consonant followed by a, e or o, in both cases followed by a whitespace
+# character. exclude_abbr() applies it with match(), i.e. at the start of
+# the entry.
+ABB=re.compile(r"(?i)(%s{1,}|%s[aeo])\s" % (CONS,CONS))
+
+# Output files. They are opened for writing as soon as this module is
+# loaded, with relative names (i.e. in the current working directory),
+# and are closed in main().
+
+# Entries tagged +AUG, one file per gender/number combination.
+aug_m_sg = open("aug_m_sg%s" % EXTENSION,"w")
+aug_m_pl = open("aug_m_pl%s" % EXTENSION,"w")
+aug_f_sg = open("aug_f_sg%s" % EXTENSION,"w")
+aug_f_pl = open("aug_f_pl%s" % EXTENSION,"w")
+
+# Entries whose word form equals the lemma and ends in "s" (see WordLemmaInS).
+wdlm_in_s_m_sg = open("wdlm_in_s_m_sg%s" % EXTENSION,"w")
+wdlm_in_s_m_pl = open("wdlm_in_s_m_pl%s" % EXTENSION,"w")
+wdlm_in_s_f_sg = open("wdlm_in_s_f_sg%s" % EXTENSION,"w")
+wdlm_in_s_f_pl = open("wdlm_in_s_f_pl%s" % EXTENSION,"w")
+
+# Entries selected by NonCanonGendMarker (masculines in -a, feminines in -o).
+masc_in_a_sg = open("masc_in_a_sg%s" % EXTENSION,"w")
+fem_in_o_sg = open("fem_in_o_sg%s" % EXTENSION,"w")
+masc_in_a_pl = open("masc_in_a_pl%s" % EXTENSION,"w")
+fem_in_o_pl = open("fem_in_o_pl%s" % EXTENSION,"w")
+
+# All remaining entries.
+other_m_sg = open("other_m_sg%s" % EXTENSION,"w")
+other_m_pl = open("other_m_pl%s" % EXTENSION,"w")
+other_f_sg = open("other_f_sg%s" % EXTENSION,"w")
+other_f_pl = open("other_f_pl%s" % EXTENSION,"w")
+
+def extract_entries(infile):
+    """Read infile and return its usable entries as a list of unicode strings.
+
+    Every line is stripped of surrounding whitespace. A line is kept only
+    if ignore_entry() returns True for it (note that, despite its name,
+    ignore_entry() returns True for the entries to keep). The filter is
+    applied to the raw line; only the surviving lines are decoded from
+    UTF-8.
+    """
+    # Read inside a context manager so that the input file is closed
+    # deterministically instead of being left to the garbage collector.
+    with open(infile,"rU") as lines:
+        stripped=[line.strip() for line in lines]
+    return [entry.decode("utf-8") for entry in stripped if ignore_entry(entry)]
+
+def split_entry(entry):
+    """Return the list of fields obtained by cutting entry at every run
+    of whitespace characters."""
+    whitespace=re.compile(r"\s+")
+    return whitespace.split(entry)
+
+def exclude_abbr(entry):
+    """Return True if the beginning of entry is matched by the ABB
+    abbreviation pattern, False otherwise."""
+    return ABB.match(entry) is not None
+
+def exclude_tag(entry):
+    """Return True if entry contains at least one of the tags listed in
+    EXCLUDE_TAGS, False otherwise."""
+    return any(tag in entry for tag in EXCLUDE_TAGS)
+
+def ignore_entry(entry):
+    """Return True if entry is to be processed, False if it is to be skipped.
+
+    NOTE: the name is misleading, since True means "keep". An entry is
+    skipped when it is empty, carries an excluded tag (exclude_tag) or
+    starts with an abbreviation (exclude_abbr).
+    """
+    if entry == "":
+        return False
+    if exclude_tag(entry):
+        return False
+    return not exclude_abbr(entry)
+
+def space(word):
+    """Return word with a single space inserted between each pair of
+    adjacent elements, e.g. "abc" becomes "a b c"."""
+    separator=" "
+    return separator.join(word)
+
+def convert_entry(word,lemma,tags):
+    """Return the two-line spaced-text representation of an entry.
+
+    The first line holds the spaced lemma followed by the tags, each one
+    prefixed with "+" and separated by a space; the second line holds the
+    spaced word form. tags is joined element by element, so it is expected
+    to be a sequence of tag names without the leading "+".
+    """
+    upper=space(lemma)
+    tag_string="+" + " +".join(tags)
+    lower=space(word)
+    return "%s %s\n%s" % (upper,tag_string,lower)
+
+def parse_entry(entry):
+    """Break entry into the tuple (word, lemma, tags).
+
+    entry must consist of exactly two whitespace-separated fields: the
+    word form and its analysis. The analysis is cut at its first "+", so
+    tags is the remaining string, e.g. "N+AUG+M+SG".
+    """
+    fields=split_entry(entry)
+    word,parse=fields
+    lemma,tags=parse.split("+",1)
+    return word,lemma,tags
+
+def WordLemmaInS(word,lemma):
+    """Return True if word and lemma both end in "s" and are identical,
+    False otherwise."""
+    both_end_in_s=word.endswith("s") and lemma.endswith("s")
+    return both_end_in_s and word == lemma
+
+
+def NonCanonGendMarker(word,tags):
+    """Return True if the ending of word does not agree with the gender
+    given in tags (masculine in -a/-as, feminine in -o/-os), False otherwise.
+
+    Each pattern below pairs a gender/number tag sequence with a word
+    ending; the first two only apply to words containing a hyphen.
+    """
+    hyphenated="-" in word
+    # (tag sequence, word ending, pattern restricted to hyphenated words)
+    patterns=(
+        ("+M+PL","a",True),   # hyphenated plural in -a, e.g. 'aços-liga'
+        ("+F+PL","o",True),   # hyphenated plural in -o, e.g. 'amostras-tipo'
+        ("+M+SG","a",False),
+        ("+M+PL","as",False),
+        ("+F+SG","o",False),
+        ("+F+PL","os",False),
+    )
+    return any(
+        (hyphenated or not hyphen_only) and marker in tags and word.endswith(ending)
+        for marker,ending,hyphen_only in patterns
+    )
+
+def write_entries(entries):
+    """Convert every entry to spaced-text format and write it to the
+    module-level output file selected by its class and gender/number tags.
+
+    Classes are tested in this order: augmentatives (tag +AUG), entries
+    whose word form equals the lemma and ends in "s" (WordLemmaInS),
+    entries with a non-canonical gender marker (NonCanonGendMarker), and
+    all others. Augmentatives keep their first two tags, the other classes
+    only the first one. Each record is followed by an empty line.
+
+    In the first, second and last class an entry matching none of +M+SG,
+    +M+PL and +F+SG is written to the feminine plural file; in the
+    non-canonical class an entry matching none of the four gender/number
+    combinations is not written at all.
+    """
+    for entry in entries:
+        word,lemma,tags=parse_entry(entry)
+        # tags is a string such as "N+AUG+M+SG". Split it once so that the
+        # first tag can be taken as a whole; indexing the string (tags[0])
+        # only yields its first character, which is correct for one-letter
+        # categories only.
+        tag_list=tags.split("+")
+        if "+AUG" in tags:
+            stxt=convert_entry(word,lemma,tag_list[:2]).encode("utf-8")
+            if "+M+SG" in tags:
+                aug_m_sg.write("%s\n\n" % stxt)
+            elif "+M+PL" in tags:
+                aug_m_pl.write("%s\n\n" % stxt)
+            elif "+F+SG" in tags:
+                aug_f_sg.write("%s\n\n" % stxt)
+            else:
+                aug_f_pl.write("%s\n\n" % stxt)
+
+        elif WordLemmaInS(word,lemma): # TODO: use tag_list[:-2], excluding gender and number tags,
+            # but including other tags besides the category tag (this may be useful in the future)
+            stxt=convert_entry(word,lemma,tag_list[:1]).encode("utf-8")
+            if "+M+SG" in tags:
+                wdlm_in_s_m_sg.write("%s\n\n" % stxt)
+            elif "+M+PL" in tags:
+                wdlm_in_s_m_pl.write("%s\n\n" % stxt)
+            elif "+F+SG" in tags:
+                wdlm_in_s_f_sg.write("%s\n\n" % stxt)
+            else:
+                wdlm_in_s_f_pl.write("%s\n\n" % stxt)
+
+        elif NonCanonGendMarker(word,tags): # TODO: see the above comment
+            stxt=convert_entry(word,lemma,tag_list[:1]).encode("utf-8")
+            if "+M+SG" in tags:
+                masc_in_a_sg.write("%s\n\n" % stxt)
+            elif "+F+SG" in tags:
+                fem_in_o_sg.write("%s\n\n" % stxt)
+            # Plural forms used to be discarded here, which generated
+            # incorrect plurals of compounds like 'cabeça-chata' (23/01/2020).
+            elif "+F+PL" in tags:
+                fem_in_o_pl.write("%s\n\n" % stxt)
+            elif "+M+PL" in tags:
+                masc_in_a_pl.write("%s\n\n" % stxt)
+        else:
+            stxt=convert_entry(word,lemma,tag_list[:1]).encode("utf-8")  # TODO: tag_list[:-2] (see above)
+            if "+M+SG" in tags:
+                other_m_sg.write("%s\n\n" % stxt)
+            elif "+M+PL" in tags:
+                other_m_pl.write("%s\n\n" % stxt)
+            elif "+F+SG" in tags:
+                other_f_sg.write("%s\n\n" % stxt)
+            else:
+                other_f_pl.write("%s\n\n" % stxt)
+
+def main():
+    """Process every input file named on the command line and close the
+    output files afterwards.
+
+    Each file in sys.argv[1:] is read with extract_entries() and its
+    entries are written with write_entries().
+    """
+    outputs=(
+        aug_m_sg,aug_m_pl,aug_f_sg,aug_f_pl,
+        wdlm_in_s_m_sg,wdlm_in_s_m_pl,wdlm_in_s_f_sg,wdlm_in_s_f_pl,
+        masc_in_a_sg,fem_in_o_sg,masc_in_a_pl,fem_in_o_pl,
+        other_m_sg,other_m_pl,other_f_sg,other_f_pl,
+    )
+    try:
+        for infile in sys.argv[1:]:
+            write_entries(extract_entries(infile))
+    finally:
+        # Close (and thereby flush) every output file even if an input
+        # file cannot be read or parsed; the error itself still propagates.
+        for handle in outputs:
+            handle.close()
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/fst/alternation-rules.xfst b/tools/fst/alternation-rules.xfst
@@ -1,5 +1,5 @@
-# Author: Leonel F. de Alencar, Federal University of Ceará
-# Date: April 16, 2018
+# Author: Leonel F. de Alencar, [email protected], Federal University of Ceará
+# Date: April 27, 2018, bug corrections February 17, 2020
 
 # Implementation of diminutive formation in Portuguese in the paradigm 
 # of finite-state morphology (Beesley & Karttunen 2003)
@@ -15,7 +15,7 @@
 # processes in Portuguese. The individual transducers are composed 
 # into a single transducer encoding all alternation rules.
 
-# Defining a marker for words with stemms ending in s, 
+# Defining a marker for words with stems ending in s, 
 # e.g. "lápis", "burguês", etc. In these words,
 # z of -zinho suffix is deleted after a stemm's s,
 # e. g. "lapisinho", "burguesinhos". In other cases,
@@ -30,6 +30,31 @@ define StemmS %$;
 # delete this marker
 define DelStemmS StemmS -> 0 ;
 
+# right context defining a non-final hyphen-separated compound member
+define Hyph [$"-"] ;
+
+# protect accents in non-final hyphen-separated compound members from being removed by Unaccent rule
+define Protect [
+		       [á -> A§ || _ Hyph ] 
+       	       	   .o. [é -> E§ || _ Hyph ] 
+       	       	   .o. [ê -> E¢ || _ Hyph] 
+       	       	   .o. [ó -> O§ || _ Hyph]
+       		   .o. [ô -> O¢ || _ Hyph] 
+		   .o. [í -> I§ || _ Hyph] 
+		   .o. [ú -> U§ || _ Hyph]
+		   .o. [â -> A¢ || _ Hyph]
+		   ];
+
+# convert protected letters back into accented letters
+# (inverse of Protect: each placeholder symbol is mapped back to the
+# accented vowel it stands for; the first rule must read A§ -> á, since
+# á -> A§ would leave the placeholder A§ unconverted by this transducer)
+define Reconv [[ A§ -> á ]
+		   .o. [ E§ -> é ]
+		   .o. [ E¢ -> ê ]
+		   .o. [ O§ -> ó ]
+		   .o. [ O¢ -> ô ]
+		   .o. [ I§ -> í ]
+		   .o. [ U§ -> ú ]
+		   .o. [ A¢ -> â ]];
+
 # anterior vowels
 define AntVow [ e | i ] ;
 
@@ -52,6 +77,15 @@ define PhonC [c -> %[ s %] || _ AntVow MorphSep  ]  ;
 # convett back phone [s] to letter c
 define OrthC %[ s %] -> c ;
 
+
+# convert letter g to phone [Z] (SAMPA code for the voiced
+# postalveolar fricative [ʒ] in IPA) to prevent rule ChangeG
+# from applying in cases like herege^inha (diminutive of herege)
+define PhonG [g -> %[ Z %] || _ AntVow MorphSep  ]  ;
+
+# convert back phone [Z] to letter g
+define OrthG %[ Z %] -> g ;
+
 # delete ç before morpheme separator and anterior vowel
 define DeleteCedilla  [ ç -> c || _ MorphSep AntVow ];
 
@@ -83,7 +117,10 @@ define OptDelEStemZ e (->) 0 || [z | s] _ s MorphSep z ;
 # words with the stem ending in r, 
 # e.g. flores^zinhas  (diminutive of "flor" 'flower' in plural)
 # flores^zinhas => flors^zinhas
-define OptDelEStemR e (->) 0 || r _ s MorphSep z ;
+define OptDelEStemR e (->) 0 || Vow r _ s MorphSep z ;
+
+# TODO: abdômen => abdômenes => abdomenezinhos
+# => abdomenzinhos
 
 # composing the two previous rules in one single FST
 define OptDelE OptDelEStemZ .o. OptDelEStemR ;
@@ -113,24 +150,32 @@ define Unaccent [[á -> a] .o. [é -> e] .o. [ê -> e] .o. [ó -> o]
 define AltRules NasalBilabAssim .o. 
        				PhonC
 				.o.
+				PhonG
+				.o.
        				ThemVowDel 
 				.o. 
 				ChangeC 
 				.o.
 				OrthC
 				.o.
-				ChangeG 
+				ChangeG
+				.o.
+				OrthG
 				.o.
 				OptDelE
 				.o.
 				PluralSDeletion 
 				.o.
 				SuffZDeletion
 				.o.
+				Protect
+				.o.
 				IDeletion
 				.o.
 				Unaccent
 				.o.
+				Reconv
+				.o.
 				DeleteCedilla
 				.o.
 				DelStemmS