-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* issue #72: updating readme * issue #72: updating readme * issue #72 bug fixes in finite-state source code, readme updates * issue #72 including Python module for the extraction of noun and adjective bases * issue #72 including Python module for the extraction of noun and adjective bases
- Loading branch information
Showing
25 changed files
with
9,971 additions
and
346,679 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,207 @@ | ||
#! /usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
|
||
# Author: Leonel Figueiredo de Alencar | ||
# [email protected] | ||
# Date: April 20, 2018, updated February 18, 2020 | ||
|
||
"""This module is the first component of the architecture of a generator of Portuguese diminutives. It extracts possible diminutive formation bases from existing nouns and adjectives encoded in MorphoBr's format, as described in the following paper: | ||
ALENCAR, Leonel Figueiredo de; CUCONATO , Bruno; RADEMAKER, Alexandre. MorphoBr: an open source large-coverage full-form lexicon for morphological analysis of Portuguese. Texto Livre: Linguagem e Tecnologia, Belo Horizonte, v. 11, n. 3, p. 1-25, set.- dez. 2018. | ||
ISSN 1983-3652 | ||
DOI: 10.17851/1983-3652.11.3.1-25 | ||
http://www.periodicos.letras.ufmg.br/index.php/textolivre/article/view/14294. | ||
Implausible bases are filtered out, see details below. The extracted bases are converted to spaced-text format and written to different files, according to the classification expected by the finite-state grammar in the morphotactic-grammar.lexc file. Some examples may help clarify the pipeline: | ||
Input in MorphoBr's format: | ||
agulhão agulha+N+AUG+M+SG | ||
agulhões agulha+N+AUG+M+PL | ||
agulhona agulha+N+AUG+F+SG | ||
agulhonas agulha+N+AUG+F+PL | ||
Output generated by this module (written to different files): | ||
a g u l h a +N +AUG | ||
a g u l h ã o | ||
a g u l h a +N +AUG | ||
a g u l h õ e s | ||
a g u l h a +N +AUG | ||
a g u l h o n a | ||
a g u l h a +N +AUG | ||
a g u l h o n a s | ||
""" | ||
import os, sys, re | ||
|
||
# Entries whose text contains any of these tags are filtered out (see exclude_tag).
EXCLUDE_TAGS=["+DIM","+SUPER"] | ||
# Extension appended to the name of every output file opened by this module.
EXTENSION=".stxt" | ||
|
||
"""Regex pattern matching items that cannot function as bases for | ||
morphological derivations. This includes one or more consonants before a space | ||
at the beginning of a line, for example: | ||
b b+N+M+SG | ||
c c+N+M+SG | ||
d d+N+M+SG | ||
These items are in fact abbreviations. As such, they cannot feed diminutive formation, | ||
e.g. *bzinho 'little b' is ungrammatical (the correct form is 'bezinho', from 'bê', the name of | ||
letter b). | ||
The regex pattern also matches abbreviations such as 'ha' (for hectare) and chemical symbols ('Ba', 'Ca', etc.). | ||
""" | ||
# Character class of consonant letters; ABB applies it case-insensitively through its (?i) flag.
CONS="[bcdfghjklmnpqrstvwxyz]" | ||
# Either one or more consonants, or a consonant followed by a, e or o, and then a
# whitespace character. exclude_abbr applies it with match(), so the pattern is
# anchored at the beginning of the entry.
ABB=re.compile(r"(?i)(%s{1,}|%s[aeo])\s" % (CONS,CONS)) | ||
|
||
# NOTE: the sixteen output files below are opened in "w" mode (created or
# truncated) as soon as this module is executed or imported; the names are
# relative, so they land in the current working directory. main() closes them.

# Augmentative entries (tags containing +AUG), one file per gender/number.
aug_m_sg = open("aug_m_sg%s" % EXTENSION,"w") | ||
aug_m_pl = open("aug_m_pl%s" % EXTENSION,"w") | ||
aug_f_sg = open("aug_f_sg%s" % EXTENSION,"w") | ||
aug_f_pl = open("aug_f_pl%s" % EXTENSION,"w") | ||
|
||
# Entries whose word form and lemma are identical and end in "s" (see WordLemmaInS).
wdlm_in_s_m_sg = open("wdlm_in_s_m_sg%s" % EXTENSION,"w") | ||
wdlm_in_s_m_pl = open("wdlm_in_s_m_pl%s" % EXTENSION,"w") | ||
wdlm_in_s_f_sg = open("wdlm_in_s_f_sg%s" % EXTENSION,"w") | ||
wdlm_in_s_f_pl = open("wdlm_in_s_f_pl%s" % EXTENSION,"w") | ||
|
||
# Entries with a non-canonical gender marker (see NonCanonGendMarker):
# masculines ending in -a(s) and feminines ending in -o(s).
masc_in_a_sg = open("masc_in_a_sg%s" % EXTENSION,"w") | ||
fem_in_o_sg = open("fem_in_o_sg%s" % EXTENSION,"w") | ||
masc_in_a_pl = open("masc_in_a_pl%s" % EXTENSION,"w") | ||
fem_in_o_pl = open("fem_in_o_pl%s" % EXTENSION,"w") | ||
|
||
# Every remaining entry, one file per gender/number.
other_m_sg = open("other_m_sg%s" % EXTENSION,"w") | ||
other_m_pl = open("other_m_pl%s" % EXTENSION,"w") | ||
other_f_sg = open("other_f_sg%s" % EXTENSION,"w") | ||
other_f_pl = open("other_f_pl%s" % EXTENSION,"w") | ||
|
||
def extract_entries(infile):
    """Return the usable entries of a MorphoBr file as a list of text strings.

    Each line of *infile* is read as UTF-8 text and stripped of surrounding
    whitespace. A line is kept only when ``ignore_entry`` returns True for it,
    i.e. when it is non-empty, carries no excluded tag and is not an
    abbreviation.
    """
    # io.open decodes while reading and behaves the same on Python 2 and 3.
    # The former open(infile, "rU") never closed the file, and the "U" mode
    # flag was removed in Python 3.11.
    import io

    with io.open(infile, encoding="utf-8") as lines:
        stripped = (line.strip() for line in lines)
        return [entry for entry in stripped if ignore_entry(entry)]
|
||
def split_entry(entry):
    """Split *entry* at every run of whitespace and return the list of fields."""
    whitespace_run = re.compile(r"\s+")
    fields = whitespace_run.split(entry)
    return fields
|
||
def exclude_abbr(entry):
    """Return True when *entry* starts with an abbreviation-like item matched by ABB."""
    return ABB.match(entry) is not None
|
||
def exclude_tag(entry):
    """Return True when *entry* contains at least one of the tags in EXCLUDE_TAGS."""
    return any(tag in entry for tag in EXCLUDE_TAGS)
|
||
def ignore_entry(entry):
    """Tell whether *entry* passes the filters.

    NOTE: despite its name, this function returns True for entries that are
    to be KEPT and False for entries that are to be dropped: empty strings,
    entries with an excluded tag, and abbreviations.
    """
    if entry == "":
        return False
    if exclude_tag(entry):
        return False
    return not exclude_abbr(entry)
|
||
def space(word):
    """Return *word* with a single space inserted between its characters."""
    separator = " "
    return separator.join(word)
|
||
def convert_entry(word,lemma,tags):
    """Build the two-line spaced-text representation of an entry.

    The first line holds the spaced lemma followed by the tags, each prefixed
    with "+" and separated by spaces; the second line holds the spaced word
    form. *tags* is joined with " +", so it may be a list of tag names or a
    single one-character string.
    """
    spaced_lemma = space(lemma)
    tag_field = "+%s" % " +".join(tags)
    spaced_word = space(word)
    return "%s %s\n%s" % (spaced_lemma, tag_field, spaced_word)
|
||
def parse_entry(entry):
    """Decompose a MorphoBr entry into ``(word, lemma, tags)``.

    The entry must consist of exactly two whitespace-separated fields: the
    word form and its parse. The parse is cut at its first "+", so *tags* is
    the remaining tag string without a leading "+", e.g. "N+AUG+M+SG".
    """
    fields = split_entry(entry)
    word, parse = fields
    lemma, tags = re.split(r"\+", parse, maxsplit=1)
    return word, lemma, tags
|
||
def WordLemmaInS(word,lemma):
    """Return True when *word* and *lemma* are identical and both end in "s"."""
    return word.endswith("s") and lemma.endswith("s") and word == lemma
|
||
|
||
def NonCanonGendMarker(word,tags):
    """Return True when the ending of *word* does not match its gender tag.

    This covers masculines ending in "a" (singular) or "as" (plural) and
    feminines ending in "o" (singular) or "os" (plural). For hyphenated
    words, a masculine plural ending in "a" or a feminine plural ending in
    "o" also qualifies (N-N compounds like 'aços-liga' and 'amostras-tipo').
    """
    hyphenated = "-" in word
    # (gender/number tags, word ending, applies to hyphenated words only)
    rules = (
        ("+M+PL", "a", True),
        ("+F+PL", "o", True),
        ("+M+SG", "a", False),
        ("+M+PL", "as", False),
        ("+F+SG", "o", False),
        ("+F+PL", "os", False),
    )
    for marker, ending, compound_only in rules:
        if compound_only and not hyphenated:
            continue
        if marker in tags and word.endswith(ending):
            return True
    return False
|
||
def _pick_output(tags, candidates, fallback=None):
    """Return the file paired with the first marker found in *tags*, else *fallback*."""
    for marker, handle in candidates:
        if marker in tags:
            return handle
    return fallback


def write_entries(entries):
    """Convert each entry to spaced-text format and write it to its output file.

    Entries are classified in this order:

    1. augmentatives (tags containing "+AUG"), written with their first two
       tags (category and AUG);
    2. entries whose word form and lemma are identical and end in "s";
    3. entries with a non-canonical gender marker (see NonCanonGendMarker);
    4. everything else.

    Classes 2-4 are written with the category tag only. Within a class the
    file is chosen from the gender/number tags. In classes 1, 2 and 4 an
    entry matching none of +M+SG, +M+PL, +F+SG goes to the feminine plural
    file. Every record is followed by an empty line.
    """
    aug_files = (("+M+SG", aug_m_sg), ("+M+PL", aug_m_pl), ("+F+SG", aug_f_sg))
    wdlm_files = (("+M+SG", wdlm_in_s_m_sg), ("+M+PL", wdlm_in_s_m_pl),
                  ("+F+SG", wdlm_in_s_f_sg))
    # NOTE (23/01/2020): plural forms were formerly discarded in this class;
    # the original note mentions incorrect plurals of compounds like
    # 'cabeça-chata' in this connection. Plurals are currently written.
    noncanon_files = (("+M+SG", masc_in_a_sg), ("+F+SG", fem_in_o_sg),
                      ("+F+PL", fem_in_o_pl), ("+M+PL", masc_in_a_pl))
    other_files = (("+M+SG", other_m_sg), ("+M+PL", other_m_pl),
                   ("+F+SG", other_f_sg))

    for entry in entries:
        word, lemma, tags = parse_entry(entry)
        tag_list = re.split(r"\+", tags)
        # tag_list[:1] is the category tag as a whole. The former tags[0]
        # took the first CHARACTER of the tag string, which is only correct
        # for one-letter categories such as N and A.
        # TODO: use tag_list[:-2], excluding gender and number tags but
        # including other tags besides the category tag (this may be useful
        # in the future).
        if "+AUG" in tags:
            kept_tags = tag_list[:2]
            target = _pick_output(tags, aug_files, aug_f_pl)
        elif WordLemmaInS(word, lemma):
            kept_tags = tag_list[:1]
            target = _pick_output(tags, wdlm_files, wdlm_in_s_f_pl)
        elif NonCanonGendMarker(word, tags):
            kept_tags = tag_list[:1]
            target = _pick_output(tags, noncanon_files)
        else:
            kept_tags = tag_list[:1]
            target = _pick_output(tags, other_files, other_f_pl)

        if target is None:
            continue
        stxt = convert_entry(word, lemma, kept_tags).encode("utf-8")
        target.write("%s\n\n" % stxt)
|
||
def main():
    """Process every input file named on the command line, then close all output files.

    The output files are closed in a ``finally`` clause so that whatever was
    written is flushed even when an input file is missing or malformed.
    """
    outputs = (
        aug_m_sg, aug_m_pl, aug_f_sg, aug_f_pl,
        wdlm_in_s_m_sg, wdlm_in_s_m_pl, wdlm_in_s_f_sg, wdlm_in_s_f_pl,
        masc_in_a_sg, fem_in_o_sg, masc_in_a_pl, fem_in_o_pl,
        other_m_sg, other_m_pl, other_f_sg, other_f_pl,
    )
    try:
        for infile in sys.argv[1:]:
            write_entries(extract_entries(infile))
    finally:
        for handle in outputs:
            handle.close()
|
||
if __name__ == '__main__': | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
# Author: Leonel F. de Alencar, Federal University of Ceará | ||
# Date: April 16, 2018 | ||
# Author: Leonel F. de Alencar, [email protected], Federal University of Ceará | ||
# Date: April 27, 2018, bug corrections February 17, 2020 | ||
|
||
# Implementation of diminutive formation in Portuguese in the paradigm | ||
# of finite-state morphology (Beesley & Karttunen 2003) | ||
|
@@ -15,7 +15,7 @@ | |
# processes in Portuguese. The individual transducers are composed | ||
# into a single transducer encoding all alternation rules. | ||
|
||
# Defining a marker for words with stemms ending in s, | ||
# Defining a marker for words with stems ending in s, | ||
# e.g. "lápis", "burguês", etc. In these words, | ||
# z of -zinho suffix is deleted after a stem's s, | ||
# e. g. "lapisinho", "burguesinhos". In other cases, | ||
|
@@ -30,6 +30,31 @@ define StemmS %$; | |
# delete this marker | ||
define DelStemmS StemmS -> 0 ; | ||
|
||
# right context defining a non-final hyphen-separated compound member | ||
define Hyph [$"-"] ; | ||
|
||
# protect accents in non-final hyphen-separated compound members from being removed by Unaccent rule | ||
define Protect [ | ||
[á -> A§ || _ Hyph ] | ||
.o. [é -> E§ || _ Hyph ] | ||
.o. [ê -> E¢ || _ Hyph] | ||
.o. [ó -> O§ || _ Hyph] | ||
.o. [ô -> O¢ || _ Hyph] | ||
.o. [í -> I§ || _ Hyph] | ||
.o. [ú -> U§ || _ Hyph] | ||
.o. [â -> A¢ || _ Hyph] | ||
]; | ||
|
||
# convert protected letters back into accented letters
# (inverse of Protect: each placeholder introduced by Protect is mapped back
# to its accented letter; the first rule used to read [á -> A§], which left
# the placeholder A§ in the output instead of restoring á)
define Reconv [[ A§ -> á ]
.o. [ E§ -> é ]
.o. [ E¢ -> ê ]
.o. [ O§ -> ó ]
.o. [ O¢ -> ô ]
.o. [ I§ -> í ]
.o. [ U§ -> ú ]
.o. [ A¢ -> â ]];
|
||
# anterior vowels | ||
define AntVow [ e | i ] ; | ||
|
||
|
@@ -52,6 +77,15 @@ define PhonC [c -> %[ s %] || _ AntVow MorphSep ] ; | |
# convert back phone [s] to letter c | ||
define OrthC %[ s %] -> c ; | ||
|
||
|
||
# convert letter g to phone [Z] (SAMPA code for the voiced | ||
# postalveolar fricative [ʒ] in IPA) to prevent rule ChangeG | ||
# from applying in cases like herege^inha (diminutive of herege) | ||
define PhonG [g -> %[ Z %] || _ AntVow MorphSep ] ; | ||
|
||
# convert back phone [Z] to letter g | ||
define OrthG %[ Z %] -> g ; | ||
|
||
# delete ç before morpheme separator and anterior vowel | ||
define DeleteCedilla [ ç -> c || _ MorphSep AntVow ]; | ||
|
||
|
@@ -83,7 +117,10 @@ define OptDelEStemZ e (->) 0 || [z | s] _ s MorphSep z ; | |
# words with the stem ending in r, | ||
# e.g. flores^zinhas (diminutive of "flor" 'flower' in plural) | ||
# flores^zinhas => flors^zinhas | ||
define OptDelEStemR e (->) 0 || r _ s MorphSep z ; | ||
define OptDelEStemR e (->) 0 || Vow r _ s MorphSep z ; | ||
|
||
# TODO: abdômen => abdômenes => abdomenezinhos | ||
# => abdomenzinhos | ||
|
||
# composing the two previous rules in one single FST | ||
define OptDelE OptDelEStemZ .o. OptDelEStemR ; | ||
|
@@ -113,24 +150,32 @@ define Unaccent [[á -> a] .o. [é -> e] .o. [ê -> e] .o. [ó -> o] | |
define AltRules NasalBilabAssim .o. | ||
PhonC | ||
.o. | ||
PhonG | ||
.o. | ||
ThemVowDel | ||
.o. | ||
ChangeC | ||
.o. | ||
OrthC | ||
.o. | ||
ChangeG | ||
ChangeG | ||
.o. | ||
OrthG | ||
.o. | ||
OptDelE | ||
.o. | ||
PluralSDeletion | ||
.o. | ||
SuffZDeletion | ||
.o. | ||
Protect | ||
.o. | ||
IDeletion | ||
.o. | ||
Unaccent | ||
.o. | ||
Reconv | ||
.o. | ||
DeleteCedilla | ||
.o. | ||
DelStemmS | ||
|
Oops, something went wrong.