Merge pull request #3604 from flairNLP/add_bavarian_wiki_dataset
Add BarNER Dataset
alanakbik authored Feb 3, 2025
2 parents e00e0ff + 4332d79 commit 3d24c35
Showing 3 changed files with 116 additions and 0 deletions.
2 changes: 2 additions & 0 deletions flair/datasets/__init__.py
@@ -185,6 +185,7 @@
    NER_ARABIC_ANER,
    NER_ARABIC_AQMAR,
    NER_BASQUE,
    NER_BAVARIAN_WIKI,
    NER_CHINESE_WEIBO,
    NER_DANISH_DANE,
    NER_ENGLISH_MOVIE_COMPLEX,
@@ -477,6 +478,7 @@
"NER_ARABIC_ANER",
"NER_ARABIC_AQMAR",
"NER_BASQUE",
"NER_BAVARIAN_WIKI",
"NER_CHINESE_WEIBO",
"NER_DANISH_DANE",
"NER_ENGLISH_MOVIE_COMPLEX",
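
With the two __init__.py entries above, the new dataset class is exported from the package root. A minimal smoke check (a sketch, assuming flair is installed from this branch):

import flair.datasets

# The class is available both as a direct import ...
from flair.datasets import NER_BAVARIAN_WIKI

# ... and via the package namespace, as used in the new test below.
assert flair.datasets.NER_BAVARIAN_WIKI is NER_BAVARIAN_WIKI
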
91 changes: 91 additions & 0 deletions flair/datasets/sequence_labeling.py
Original file line number Diff line number Diff line change
@@ -5530,3 +5530,94 @@ def __init__(
            corpora,
            name="masakha-pos-" + "-".join(languages),
        )


class NER_BAVARIAN_WIKI(ColumnCorpus):
    def __init__(
        self,
        fine_grained: bool = False,
        revision: str = "main",
        base_path: Optional[Union[str, Path]] = None,
        in_memory: bool = True,
        **corpusargs,
    ) -> None:
"""Initialize the Bavarian NER Bavarian NER Dataset (BarNER).
The dataset was proposed in the 2024 LREC-COLING paper
"Sebastian, Basti, Wastl?! Recognizing Named Entities in Bavarian Dialectal Data" paper by Peng et al.
:param fine_grained: Defines if the fine-grained or coarse-grained (default) should be used.
:param revision: Defines the revision/commit of BarNER dataset, by default dataset from 'main' branch is used.
:param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
to point to a different folder but typically this should not be necessary.
:param in_memory: If True, keeps dataset in memory giving speedups in training.
"""
        base_path = flair.cache_root / "datasets" if not base_path else Path(base_path)
        dataset_name = self.__class__.__name__.lower()
        data_folder = base_path / dataset_name
        data_path = flair.cache_root / "datasets" / dataset_name

        document_boundary_marker = "-DOCSTART-"

        for split in ["train", "dev", "test"]:
            # Get original version
            original_split_filename = data_path / "original" / f"bar-wiki-{split}.tsv"
            if not original_split_filename.is_file():
                original_split_url = (
                    f"https://raw.githubusercontent.com/mainlp/BarNER/{revision}/data/BarNER-final/bar-wiki-{split}.tsv"
                )
                cached_path(original_split_url, data_path / "original")

            # Add document boundary markers and strip metadata comments:
            # a "# newdoc id = ..." line becomes "-DOCSTART-\tO" plus a blank line,
            # while all other "# ..." comment lines are dropped
            modified_split_filename = data_path / f"bar-wiki-{split}.tsv"
            if not modified_split_filename.is_file():
                with open(modified_split_filename, "w", encoding="utf-8") as f_out:
                    with open(original_split_filename, encoding="utf-8") as f_p:
                        for line in f_p:
                            line = line.strip()
                            if line.startswith("# newdoc id = "):
                                f_out.write(f"{document_boundary_marker}\tO\n\n")
                                continue
                            if line.startswith("# "):
                                continue
                            f_out.write(f"{line}\n")

        columns = {0: "text", 1: "ner"}

        label_name_map = None

        if not fine_grained:
            # The only allowed classes in the coarse-grained setting are: PER, LOC, ORG and MISC.
            # All other NEs are normalized to O, except EVENT and WOA, which are normalized to MISC (cf. Table 3 of the paper).
            label_name_map = {
                "EVENT": "MISC",
                "EVENTderiv": "O",
                "EVENTpart": "O",
                "LANG": "O",
                "LANGderiv": "O",
                "LANGpart": "O",
                "LOCderiv": "O",
                "LOCpart": "O",
                "MISCderiv": "O",
                "MISCpart": "O",
                "ORGderiv": "O",
                "ORGpart": "O",
                "PERderiv": "O",
                "PERpart": "O",
                "RELIGION": "O",
                "RELIGIONderiv": "O",
                "WOA": "MISC",
                "WOAderiv": "O",
                "WOApart": "O",
            }

        super().__init__(
            data_folder,
            columns,
            in_memory=in_memory,
            comment_symbol="# ",
            document_separator_token="-DOCSTART-",
            label_name_map=label_name_map,
            **corpusargs,
        )
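
For reference, a minimal usage sketch of the new corpus class (the first call triggers the download described above; make_label_dictionary is existing flair Corpus API):

from flair.datasets import NER_BAVARIAN_WIKI

# Coarse-grained (default): tags are normalized to PER, LOC, ORG and MISC
corpus = NER_BAVARIAN_WIKI()
print(corpus)

# Fine-grained: keeps the full BarNER tag set (EVENT, LANG, RELIGION, WOA, ...)
corpus_fine = NER_BAVARIAN_WIKI(fine_grained=True)

# Inspect the label set seen by a downstream tagger
label_dict = corpus.make_label_dictionary(label_type="ner")
print(label_dict)
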
23 changes: 23 additions & 0 deletions tests/test_datasets.py
@@ -954,6 +954,29 @@ def test_german_mobie(tasks_base_path):
), f"Number of parsed tokens ({actual_tokens}) does not match with reported number of tokens ({ref_tokens})!"


@pytest.mark.skip()
def test_bavarian_wiki(tasks_base_path):
    corpus = flair.datasets.NER_BAVARIAN_WIKI()

    ref_sentences = 3_577
    ref_tokens = 75_690

    actual_sentences = sum(
        [1 for sentence in corpus.train + corpus.dev + corpus.test if sentence[0].text != "-DOCSTART-"]
    )
    actual_tokens = sum(
        [len(sentence) for sentence in corpus.train + corpus.dev + corpus.test if sentence[0].text != "-DOCSTART-"]
    )

    assert ref_sentences == actual_sentences, (
        f"Number of parsed sentences ({actual_sentences}) does not match with "
        f"reported number of sentences ({ref_sentences})!"
    )
    assert (
        ref_tokens == actual_tokens
    ), f"Number of parsed tokens ({actual_tokens}) does not match with reported number of tokens ({ref_tokens})!"


def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path):
    corpus = MultiFileJsonlCorpus(
        train_files=[tasks_base_path / "jsonl/train.jsonl"],
