Merge pull request #3604 from flairNLP/add_bavarian_wiki_dataset
Add BarNER Dataset
alanakbik authored Feb 3, 2025
2 parents e00e0ff + 4332d79 commit 3d24c35
Showing 3 changed files with 116 additions and 0 deletions.
2 changes: 2 additions & 0 deletions flair/datasets/__init__.py
@@ -185,6 +185,7 @@
    NER_ARABIC_ANER,
    NER_ARABIC_AQMAR,
    NER_BASQUE,
    NER_BAVARIAN_WIKI,
    NER_CHINESE_WEIBO,
    NER_DANISH_DANE,
    NER_ENGLISH_MOVIE_COMPLEX,
@@ -477,6 +478,7 @@
"NER_ARABIC_ANER",
"NER_ARABIC_AQMAR",
"NER_BASQUE",
"NER_BAVARIAN_WIKI",
"NER_CHINESE_WEIBO",
"NER_DANISH_DANE",
"NER_ENGLISH_MOVIE_COMPLEX",
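
With the two __init__.py entries above, the new dataset class is exported from the package root. A minimal smoke check (a sketch, assuming flair is installed from this branch):

import flair.datasets

# The class is available both as a direct import ...
from flair.datasets import NER_BAVARIAN_WIKI

# ... and via the package namespace, as used in the new test below.
assert flair.datasets.NER_BAVARIAN_WIKI is NER_BAVARIAN_WIKI
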
91 changes: 91 additions & 0 deletions flair/datasets/sequence_labeling.py
Original file line number Diff line number Diff line change
@@ -5530,3 +5530,94 @@ def __init__(
            corpora,
            name="masakha-pos-" + "-".join(languages),
        )


class NER_BAVARIAN_WIKI(ColumnCorpus):
    def __init__(
        self,
        fine_grained: bool = False,
        revision: str = "main",
        base_path: Optional[Union[str, Path]] = None,
        in_memory: bool = True,
        **corpusargs,
    ) -> None:
"""Initialize the Bavarian NER Bavarian NER Dataset (BarNER).
The dataset was proposed in the 2024 LREC-COLING paper
"Sebastian, Basti, Wastl?! Recognizing Named Entities in Bavarian Dialectal Data" paper by Peng et al.
:param fine_grained: Defines if the fine-grained or coarse-grained (default) should be used.
:param revision: Defines the revision/commit of BarNER dataset, by default dataset from 'main' branch is used.
:param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
to point to a different folder but typically this should not be necessary.
:param in_memory: If True, keeps dataset in memory giving speedups in training.
"""
        base_path = flair.cache_root / "datasets" if not base_path else Path(base_path)
        dataset_name = self.__class__.__name__.lower()
        data_folder = base_path / dataset_name
        data_path = flair.cache_root / "datasets" / dataset_name

        document_boundary_marker = "-DOCSTART-"

        for split in ["train", "dev", "test"]:
            # Get original version
            original_split_filename = data_path / "original" / f"bar-wiki-{split}.tsv"
            if not original_split_filename.is_file():
                original_split_url = (
                    f"https://raw.githubusercontent.com/mainlp/BarNER/{revision}/data/BarNER-final/bar-wiki-{split}.tsv"
                )
                cached_path(original_split_url, data_path / "original")

            # Add document boundary markers and strip metadata comments:
            # a "# newdoc id = ..." line becomes "-DOCSTART-\tO" plus a blank line,
            # while all other "# ..." comment lines are dropped
            modified_split_filename = data_path / f"bar-wiki-{split}.tsv"
            if not modified_split_filename.is_file():
                with open(modified_split_filename, "w", encoding="utf-8") as f_out:
                    with open(original_split_filename, encoding="utf-8") as f_p:
                        for line in f_p:
                            line = line.strip()
                            if line.startswith("# newdoc id = "):
                                f_out.write(f"{document_boundary_marker}\tO\n\n")
                                continue
                            if line.startswith("# "):
                                continue
                            f_out.write(f"{line}\n")

        columns = {0: "text", 1: "ner"}

        label_name_map = None

        if not fine_grained:
            # The only allowed classes in the coarse-grained setting are: PER, LOC, ORG and MISC.
            # All other NEs are normalized to O, except EVENT and WOA, which are normalized to MISC (cf. Table 3 of the paper).
            label_name_map = {
                "EVENT": "MISC",
                "EVENTderiv": "O",
                "EVENTpart": "O",
                "LANG": "O",
                "LANGderiv": "O",
                "LANGpart": "O",
                "LOCderiv": "O",
                "LOCpart": "O",
                "MISCderiv": "O",
                "MISCpart": "O",
                "ORGderiv": "O",
                "ORGpart": "O",
                "PERderiv": "O",
                "PERpart": "O",
                "RELIGION": "O",
                "RELIGIONderiv": "O",
                "WOA": "MISC",
                "WOAderiv": "O",
                "WOApart": "O",
            }

        super().__init__(
            data_folder,
            columns,
            in_memory=in_memory,
            comment_symbol="# ",
            document_separator_token="-DOCSTART-",
            label_name_map=label_name_map,
            **corpusargs,
        )
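
For reference, a minimal usage sketch of the new corpus class (the first call triggers the download described above; make_label_dictionary is existing flair Corpus API):

from flair.datasets import NER_BAVARIAN_WIKI

# Coarse-grained (default): tags are normalized to PER, LOC, ORG and MISC
corpus = NER_BAVARIAN_WIKI()
print(corpus)

# Fine-grained: keeps the full BarNER tag set (EVENT, LANG, RELIGION, WOA, ...)
corpus_fine = NER_BAVARIAN_WIKI(fine_grained=True)

# Inspect the label set seen by a downstream tagger
label_dict = corpus.make_label_dictionary(label_type="ner")
print(label_dict)
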
23 changes: 23 additions & 0 deletions tests/test_datasets.py
@@ -954,6 +954,29 @@ def test_german_mobie(tasks_base_path):
), f"Number of parsed tokens ({actual_tokens}) does not match with reported number of tokens ({ref_tokens})!"


@pytest.mark.skip()
def test_bavarian_wiki(tasks_base_path):
    corpus = flair.datasets.NER_BAVARIAN_WIKI()

    ref_sentences = 3_577
    ref_tokens = 75_690

    actual_sentences = sum(
        [1 for sentence in corpus.train + corpus.dev + corpus.test if sentence[0].text != "-DOCSTART-"]
    )
    actual_tokens = sum(
        [len(sentence) for sentence in corpus.train + corpus.dev + corpus.test if sentence[0].text != "-DOCSTART-"]
    )

    assert ref_sentences == actual_sentences, (
        f"Number of parsed sentences ({actual_sentences}) does not match with "
        f"reported number of sentences ({ref_sentences})!"
    )
    assert (
        ref_tokens == actual_tokens
    ), f"Number of parsed tokens ({actual_tokens}) does not match with reported number of tokens ({ref_tokens})!"


def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path):
    corpus = MultiFileJsonlCorpus(
        train_files=[tasks_base_path / "jsonl/train.jsonl"],
