Skip to content

Commit

Permalink
Issue 72 (#73)
Browse files Browse the repository at this point in the history
* issue #72: updating readme

* issue #72: updating readme

* issue #72 bug fixes in finite-state source code, readme updates

* issue #72 including Python module for the extraction of noun and adjective bases

* issue #72 including Python module for the extraction of noun and adjective bases
  • Loading branch information
leoalenc authored Jun 2, 2020
1 parent a92866f commit 3ad22c4
Show file tree
Hide file tree
Showing 25 changed files with 9,971 additions and 346,679 deletions.
28 changes: 0 additions & 28 deletions tools/fst/BuildTestTransducers.sh

This file was deleted.

207 changes: 207 additions & 0 deletions tools/fst/ExtractWordLemmaPairs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-

# Author: Leonel Figueiredo de Alencar
# [email protected]
# Date: April 20, 2018, updated February 18, 2020

"""This module is the first component of the architecture of a generator of Portuguese diminutives. It extracts possible diminutive formation bases from existing nouns and adjectives encoded in MorphoBr's format, as described in the following paper:
ALENCAR, Leonel Figueiredo de; CUCONATO, Bruno; RADEMAKER, Alexandre. MorphoBr: an open source large-coverage full-form lexicon for morphological analysis of Portuguese. Texto Livre: Linguagem e Tecnologia, Belo Horizonte, v. 11, n. 3, p. 1-25, set.- dez. 2018.
ISSN 1983-3652
DOI: 10.17851/1983-3652.11.3.1-25
http://www.periodicos.letras.ufmg.br/index.php/textolivre/article/view/14294.
Implausible bases are filtered out, see details below. The extracted bases are converted to spaced-text format and written to different files, according to the classification expected by the finite-state grammar in the morphotactic-grammar.lexc file. Some examples may help clarify the pipeline:
Input in MorphoBr's format:
agulhão agulha+N+AUG+M+SG
agulhões agulha+N+AUG+M+PL
agulhona agulha+N+AUG+F+SG
agulhonas agulha+N+AUG+F+PL
Output generated by this module (written to different files):
a g u l h a +N +AUG
a g u l h ã o
a g u l h a +N +AUG
a g u l h õ e s
a g u l h a +N +AUG
a g u l h o n a
a g u l h a +N +AUG
a g u l h o n a s
"""
import os, sys, re

# Entries whose line contains any of these tags are dropped (see exclude_tag).
EXCLUDE_TAGS=["+DIM","+SUPER"]
# Extension appended to the name of every output file opened below.
EXTENSION=".stxt"

"""Regex pattern matching itens that can not function as bases for
morphological derivations. This includes one or more consonants before a space
at the beginning of a line, for example:
b b+N+M+SG
c c+N+M+SG
d d+N+M+SG
These items are in fact abbreviations. As such, they cannot feed diminutive formation,
e.g. *bzinho 'little b' is ungrammatical (the correct form is 'bezinho', from 'bê', the name of
letter b).
The regex pattern also matches abbreviations such as 'ha' (for hectare) and chemical symbols ('Ba', 'Ca', etc.).
"""
# Character class listing the consonant letters used to build ABB.
CONS="[bcdfghjklmnpqrstvwxyz]"
# Case-insensitive pattern: either a run of one or more consonants, or a
# single consonant followed by a, e or o, immediately followed by one
# whitespace character. exclude_abbr() applies it with match(), i.e. anchored
# at the start of the entry.
ABB=re.compile(r"(?i)(%s{1,}|%s[aeo])\s" % (CONS,CONS))

# Output files. They are opened (and truncated) for writing as soon as the
# module is loaded, are filled by write_entries() and are closed by main().
# Each group holds one file per gender/number combination.

# Entries carrying the +AUG tag.
aug_m_sg = open("aug_m_sg%s" % EXTENSION,"w")
aug_m_pl = open("aug_m_pl%s" % EXTENSION,"w")
aug_f_sg = open("aug_f_sg%s" % EXTENSION,"w")
aug_f_pl = open("aug_f_pl%s" % EXTENSION,"w")

# Entries whose word form and lemma are identical and end in "s"
# (see WordLemmaInS).
wdlm_in_s_m_sg = open("wdlm_in_s_m_sg%s" % EXTENSION,"w")
wdlm_in_s_m_pl = open("wdlm_in_s_m_pl%s" % EXTENSION,"w")
wdlm_in_s_f_sg = open("wdlm_in_s_f_sg%s" % EXTENSION,"w")
wdlm_in_s_f_pl = open("wdlm_in_s_f_pl%s" % EXTENSION,"w")

# Entries accepted by NonCanonGendMarker: masculine forms ending in "a"
# and feminine forms ending in "o".
masc_in_a_sg = open("masc_in_a_sg%s" % EXTENSION,"w")
fem_in_o_sg = open("fem_in_o_sg%s" % EXTENSION,"w")
masc_in_a_pl = open("masc_in_a_pl%s" % EXTENSION,"w")
fem_in_o_pl = open("fem_in_o_pl%s" % EXTENSION,"w")

# Every remaining entry.
other_m_sg = open("other_m_sg%s" % EXTENSION,"w")
other_m_pl = open("other_m_pl%s" % EXTENSION,"w")
other_f_sg = open("other_f_sg%s" % EXTENSION,"w")
other_f_pl = open("other_f_pl%s" % EXTENSION,"w")

def extract_entries(infile):
    """Return the usable entries of infile as a list of decoded strings.

    Every line is stripped of surrounding whitespace. Lines for which
    ignore_entry() returns False (empty lines, lines with an excluded tag,
    abbreviations) are dropped; the remaining ones are decoded from UTF-8.

    The file is read inside a ``with`` block so that the handle is closed
    as soon as reading ends, even if decoding a line raises. Lines are
    consumed lazily instead of through readlines(), and each line is
    stripped only once.

    NOTE(review): like the rest of this module, this relies on Python 2
    byte strings (``.decode`` on the lines read, "rU" mode).
    """
    with open(infile, "rU") as source:
        stripped_lines = (line.strip() for line in source)
        return [entry.decode("utf-8")
                for entry in stripped_lines
                if ignore_entry(entry)]

def split_entry(entry):
    """Split entry on runs of whitespace and return the resulting fields."""
    whitespace_run = re.compile(r"\s+")
    fields = whitespace_run.split(entry)
    return fields

def exclude_abbr(entry):
    """Return True if entry starts with an abbreviation matched by ABB."""
    return ABB.match(entry) is not None

def exclude_tag(entry):
    """Return True if entry contains at least one tag from EXCLUDE_TAGS."""
    return any(tag in entry for tag in EXCLUDE_TAGS)

def ignore_entry(entry):
    """Tell whether entry should be kept.

    Despite its name, this returns True for entries that are to be KEPT and
    False for entries that must be skipped: the empty string, entries with
    an excluded tag and entries recognised as abbreviations.
    """
    if entry == "":
        return False
    if exclude_tag(entry):
        return False
    return not exclude_abbr(entry)

def space(word):
    """Return word with a single space inserted between its characters."""
    # A string is already an iterable of its characters.
    return " ".join(word)

def convert_entry(word,lemma,tags):
    """Return the spaced-text representation of an entry.

    The result has two lines: the spaced lemma followed by the tags (each
    prefixed with "+" and separated by a space), then the spaced word form.
    """
    tag_field = "+%s" % " +".join(tags)
    upper_side = space(lemma)
    lower_side = space(word)
    return "%s %s\n%s" % (upper_side, tag_field, lower_side)

def parse_entry(entry):
    """Break entry into a (word, lemma, tags) triple.

    entry must consist of exactly two whitespace-separated fields, the word
    form and its parse. The parse is cut at its first "+": what precedes it
    is the lemma, what follows is returned unsplit as the tag string.
    Unpacking raises ValueError when the entry does not have that shape.
    """
    fields = split_entry(entry)
    word, parse = fields
    lemma, tags = parse.split("+", 1)
    return word, lemma, tags

def WordLemmaInS(word,lemma):
    """Return True if word and lemma are identical and both end in "s"."""
    if not word.endswith("s"):
        return False
    if not lemma.endswith("s"):
        return False
    return word == lemma


def NonCanonGendMarker(word,tags):
    """Return True if word ends in a vowel that is atypical for its gender.

    This holds for masculine forms ending in "a" (singular) or "as"
    (plural) and for feminine forms ending in "o" (singular) or "os"
    (plural). Hyphenated plural forms are also accepted when they end in
    the bare vowel; the original comments cite N-N compounds such as
    'aços-liga' and 'amostras-tipo'.
    """
    hyphenated = "-" in word

    masculine_in_a = (
        ("+M+SG" in tags and word.endswith("a"))
        or ("+M+PL" in tags
            and (word.endswith("as") or (hyphenated and word.endswith("a"))))
    )
    feminine_in_o = (
        ("+F+SG" in tags and word.endswith("o"))
        or ("+F+PL" in tags
            and (word.endswith("os") or (hyphenated and word.endswith("o"))))
    )
    return masculine_in_a or feminine_in_o

def _pick_output(tags, routes, fallback=None):
    """Return the file paired with the first marker found in tags, else fallback."""
    for marker, outfile in routes:
        if marker in tags:
            return outfile
    return fallback


def write_entries(entries):
    """Convert every entry to spaced-text format and write it to its output file.

    Each entry is parsed into word, lemma and tag string and classified, in
    this order, as:

    1. augmentative (tag string contains "+AUG"): the first two tags are kept;
    2. word and lemma identical and ending in "s" (WordLemmaInS);
    3. non-canonical gender marker (NonCanonGendMarker);
    4. anything else.

    In classes 2-4 only the first (category) tag is kept. Within a class the
    gender/number markers found in the tag string select the output file.
    Each record is written UTF-8 encoded and followed by an empty line.

    The category tag is obtained by splitting the tag string on "+", not by
    indexing the string: indexing returns the first *character*, which is
    only correct while category tags are one letter long.

    Raises:
        ValueError: if the entry cannot be parsed (see parse_entry) or its
            tag string does not start with a category tag.
    """
    # (marker, file) pairs are tried in order; the first marker contained in
    # the tag string wins.
    aug_routes = (("+M+SG", aug_m_sg), ("+M+PL", aug_m_pl),
                  ("+F+SG", aug_f_sg))
    in_s_routes = (("+M+SG", wdlm_in_s_m_sg), ("+M+PL", wdlm_in_s_m_pl),
                   ("+F+SG", wdlm_in_s_f_sg))
    # NOTE(review): the original comments mention that plural forms were once
    # discarded in this class, in connection with incorrect plurals of
    # compounds; plurals are written here, as in the code being replaced.
    gender_routes = (("+M+SG", masc_in_a_sg), ("+F+SG", fem_in_o_sg),
                     ("+F+PL", fem_in_o_pl), ("+M+PL", masc_in_a_pl))
    other_routes = (("+M+SG", other_m_sg), ("+M+PL", other_m_pl),
                    ("+F+SG", other_f_sg))

    for entry in entries:
        word, lemma, tags = parse_entry(entry)
        tag_list = tags.split("+")
        if not tag_list[0]:
            raise ValueError("entry without a category tag: %r" % (entry,))

        # TODO: keep tag_list[:-2] (everything but gender and number) in the
        # non-augmentative classes; this may be useful in the future.
        if "+AUG" in tags:
            kept_tags = tag_list[:2]
            outfile = _pick_output(tags, aug_routes, aug_f_pl)
        elif WordLemmaInS(word, lemma):
            kept_tags = tag_list[:1]
            outfile = _pick_output(tags, in_s_routes, wdlm_in_s_f_pl)
        elif NonCanonGendMarker(word, tags):
            kept_tags = tag_list[:1]
            outfile = _pick_output(tags, gender_routes)
        else:
            kept_tags = tag_list[:1]
            outfile = _pick_output(tags, other_routes, other_f_pl)

        stxt = convert_entry(word, lemma, kept_tags).encode("utf-8")
        if outfile is not None:
            outfile.write("%s\n\n" % stxt)

def main():
    """Process every input file given on the command line.

    The entries of each file named in sys.argv[1:] are extracted and written
    to the module-level output files. The output files are closed in a
    ``finally`` clause, so buffered data is flushed and the handles are
    released even when an input file cannot be read or parsed.
    """
    outputs = (
        aug_m_sg, aug_m_pl, aug_f_sg, aug_f_pl,
        wdlm_in_s_m_sg, wdlm_in_s_m_pl, wdlm_in_s_f_sg, wdlm_in_s_f_pl,
        masc_in_a_sg, fem_in_o_sg, masc_in_a_pl, fem_in_o_pl,
        other_m_sg, other_m_pl, other_f_sg, other_f_pl,
    )
    try:
        for infile in sys.argv[1:]:
            write_entries(extract_entries(infile))
    finally:
        for outfile in outputs:
            outfile.close()

# Run the extraction only when this file is executed as a script.
if __name__ == '__main__':
    main()
55 changes: 50 additions & 5 deletions tools/fst/alternation-rules.xfst
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Author: Leonel F. de Alencar, Federal University of Ceará
# Date: April 16, 2018
# Author: Leonel F. de Alencar, [email protected], Federal University of Ceará
# Date: April 27, 2018, bug corrections February 17, 2020

# Implementation of diminutive formation in Portuguese in the paradigm
# of finite-state morphology (Beesley & Karttunen 2003)
Expand All @@ -15,7 +15,7 @@
# processes in Portuguese. The individual transducers are composed
# into a single transducer encoding all alternation rules.

# Defining a marker for words with stemms ending in s,
# Defining a marker for words with stems ending in s,
# e.g. "lápis", "burguês", etc. In these words,
# z of -zinho suffix is deleted after a stem's s,
# e. g. "lapisinho", "burguesinhos". In other cases,
Expand All @@ -30,6 +30,31 @@ define StemmS %$;
# delete this marker
define DelStemmS StemmS -> 0 ;

# right context defining a non-final hyphen-separated compound member
define Hyph [$"-"] ;

# protect accents in non-final hyphen-separated compound members from being removed by Unaccent rule
define Protect [
[á -> A§ || _ Hyph ]
.o. [é -> E§ || _ Hyph ]
.o. [ê -> E¢ || _ Hyph]
.o. [ó -> O§ || _ Hyph]
.o. [ô -> O¢ || _ Hyph]
.o. [í -> I§ || _ Hyph]
.o. [ú -> U§ || _ Hyph]
.o. [â -> A¢ || _ Hyph]
];

# convert protected letters back into accented letters
# (inverse of Protect: every rule maps a placeholder to its accented letter)
define Reconv [[ A§ -> á ]
.o. [ E§ -> é ]
.o. [ E¢ -> ê ]
.o. [ O§ -> ó ]
.o. [ O¢ -> ô ]
.o. [ I§ -> í ]
.o. [ U§ -> ú ]
.o. [ A¢ -> â ]];

# anterior vowels
define AntVow [ e | i ] ;

Expand All @@ -52,6 +77,15 @@ define PhonC [c -> %[ s %] || _ AntVow MorphSep ] ;
# convert back phone [s] to letter c
define OrthC %[ s %] -> c ;


# convert letter g to phone [Z] (SAMPA code for the voiced
# postalveolar fricative [ʒ] in IPA) to prevent rule ChangeG
# from applying in cases like herege^inha (diminutive of herege)
define PhonG [g -> %[ Z %] || _ AntVow MorphSep ] ;

# convert back phone [Z] to letter g
define OrthG %[ Z %] -> g ;

# delete ç before morpheme separator and anterior vowel
define DeleteCedilla [ ç -> c || _ MorphSep AntVow ];

Expand Down Expand Up @@ -83,7 +117,10 @@ define OptDelEStemZ e (->) 0 || [z | s] _ s MorphSep z ;
# words with the stem ending in r,
# e.g. flores^zinhas (diminutive of "flor" 'flower' in plural)
# flores^zinhas => flors^zinhas
define OptDelEStemR e (->) 0 || r _ s MorphSep z ;
define OptDelEStemR e (->) 0 || Vow r _ s MorphSep z ;

# TODO: abdômen => abdômenes => abdomenezinhos
# => abdomenzinhos

# composing the two previous rules in one single FST
define OptDelE OptDelEStemZ .o. OptDelEStemR ;
Expand Down Expand Up @@ -113,24 +150,32 @@ define Unaccent [[á -> a] .o. [é -> e] .o. [ê -> e] .o. [ó -> o]
define AltRules NasalBilabAssim .o.
PhonC
.o.
PhonG
.o.
ThemVowDel
.o.
ChangeC
.o.
OrthC
.o.
ChangeG
ChangeG
.o.
OrthG
.o.
OptDelE
.o.
PluralSDeletion
.o.
SuffZDeletion
.o.
Protect
.o.
IDeletion
.o.
Unaccent
.o.
Reconv
.o.
DeleteCedilla
.o.
DelStemmS
Expand Down
Loading

0 comments on commit 3ad22c4

Please sign in to comment.