diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py
index d54ff35e01..30fa00fc67 100644
--- a/flair/datasets/__init__.py
+++ b/flair/datasets/__init__.py
@@ -185,6 +185,7 @@
     NER_ARABIC_ANER,
     NER_ARABIC_AQMAR,
     NER_BASQUE,
+    NER_BAVARIAN_WIKI,
     NER_CHINESE_WEIBO,
     NER_DANISH_DANE,
     NER_ENGLISH_MOVIE_COMPLEX,
@@ -477,6 +478,7 @@
     "NER_ARABIC_ANER",
     "NER_ARABIC_AQMAR",
     "NER_BASQUE",
+    "NER_BAVARIAN_WIKI",
     "NER_CHINESE_WEIBO",
     "NER_DANISH_DANE",
     "NER_ENGLISH_MOVIE_COMPLEX",
diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py
index 80fc6d38ba..863446c1cc 100644
--- a/flair/datasets/sequence_labeling.py
+++ b/flair/datasets/sequence_labeling.py
@@ -5530,3 +5530,94 @@ def __init__(
             corpora,
             name="masakha-pos-" + "-".join(languages),
         )
+
+
+class NER_BAVARIAN_WIKI(ColumnCorpus):
+    def __init__(
+        self,
+        fine_grained: bool = False,
+        revision: str = "main",
+        base_path: Optional[Union[str, Path]] = None,
+        in_memory: bool = True,
+        **corpusargs,
+    ) -> None:
+        """Initialize the Bavarian NER Dataset (BarNER).
+
+        The dataset was proposed in the 2024 LREC-COLING paper
+        "Sebastian, Basti, Wastl?! Recognizing Named Entities in Bavarian Dialectal Data" by Peng et al.
+        :param fine_grained: Defines whether the fine-grained or the coarse-grained (default) tag set should be used.
+        :param revision: Defines the revision/commit of the BarNER dataset; by default, the 'main' branch is used.
+        :param base_path: Default is None, meaning that the corpus gets auto-downloaded and loaded. You can override
+            this to point to a different folder, but typically this should not be necessary.
+        :param in_memory: If True, keeps dataset in memory giving speedups in training.
+        """
+        base_path = flair.cache_root / "datasets" if not base_path else Path(base_path)
+        dataset_name = self.__class__.__name__.lower()
+        data_folder = base_path / dataset_name
+        data_path = data_folder  # download target; honors a custom base_path
+
+        document_boundary_marker = "-DOCSTART-"
+
+        for split in ["train", "dev", "test"]:
+            # Download the original version of the split if it is not cached yet
+            original_split_filename = data_path / "original" / f"bar-wiki-{split}.tsv"
+            if not original_split_filename.is_file():
+                original_split_url = (
+                    f"https://raw.githubusercontent.com/mainlp/BarNER/{revision}/data/BarNER-final/bar-wiki-{split}.tsv"
+                )
+                cached_path(original_split_url, data_path / "original")
+
+            # Rewrite the split with document boundary markers added and comment lines stripped
+            modified_split_filename = data_path / f"bar-wiki-{split}.tsv"
+            if not modified_split_filename.is_file():
+                f_out = open(modified_split_filename, "w", encoding="utf-8")
+
+                with open(original_split_filename, encoding="utf-8") as f_p:
+                    for line in f_p:
+                        line = line.strip()
+                        if line.startswith("# newdoc id = "):
+                            f_out.write(f"{document_boundary_marker}\tO\n\n")
+                            continue
+                        if line.startswith("# "):
+                            continue
+                        f_out.write(f"{line}\n")
+                f_out.close()
+
+        columns = {0: "text", 1: "ner"}
+
+        label_name_map = None
+
+        if not fine_grained:
+            # The only allowed classes in the coarse-grained setting are PER, LOC, ORG and MISC.
+            # All other NEs are normalized to O, except EVENT and WOA, which are normalized to MISC (cf. Table 3 of the paper).
+            label_name_map = {
+                "EVENT": "MISC",
+                "EVENTderiv": "O",
+                "EVENTpart": "O",
+                "LANG": "O",
+                "LANGderiv": "O",
+                "LANGpart": "O",
+                "LOCderiv": "O",
+                "LOCpart": "O",
+                "MISCderiv": "O",
+                "MISCpart": "O",
+                "ORGderiv": "O",
+                "ORGpart": "O",
+                "PERderiv": "O",
+                "PERpart": "O",
+                "RELIGION": "O",
+                "RELIGIONderiv": "O",
+                "WOA": "MISC",
+                "WOAderiv": "O",
+                "WOApart": "O",
+            }
+
+        super().__init__(
+            data_folder,
+            columns,
+            in_memory=in_memory,
+            comment_symbol="# ",
+            document_separator_token=document_boundary_marker,
+            label_name_map=label_name_map,
+            **corpusargs,
+        )
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 8e6a7019b0..9acd753337 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -954,6 +954,29 @@ def test_german_mobie(tasks_base_path):
     ), f"Number of parsed tokens ({actual_tokens}) does not match with reported number of tokens ({ref_tokens})!"
 
 
+@pytest.mark.skip(reason="requires downloading the full BarNER corpus")
+def test_bavarian_wiki(tasks_base_path):
+    corpus = flair.datasets.NER_BAVARIAN_WIKI()
+
+    ref_sentences = 3_577
+    ref_tokens = 75_690
+
+    actual_sentences = sum(
+        [1 for sentence in corpus.train + corpus.dev + corpus.test if sentence[0].text != "-DOCSTART-"]
+    )
+    actual_tokens = sum(
+        [len(sentence) for sentence in corpus.train + corpus.dev + corpus.test if sentence[0].text != "-DOCSTART-"]
+    )
+
+    assert ref_sentences == actual_sentences, (
+        f"Number of parsed sentences ({actual_sentences}) does not match with "
+        f"reported number of sentences ({ref_sentences})!"
+    )
+    assert (
+        ref_tokens == actual_tokens
+    ), f"Number of parsed tokens ({actual_tokens}) does not match with reported number of tokens ({ref_tokens})!"
+
+
 def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path):
     corpus = MultiFileJsonlCorpus(
         train_files=[tasks_base_path / "jsonl/train.jsonl"],
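
For reference, a minimal usage sketch of the new loader (not part of the diff; it assumes a flair build that includes this change, needs network access for the first download, and relies only on the existing Corpus.make_label_dictionary API):

from flair.datasets import NER_BAVARIAN_WIKI

# Coarse-grained setting (default): BarNER tags are mapped down to
# PER, LOC, ORG and MISC via the label_name_map defined above.
corpus = NER_BAVARIAN_WIKI(fine_grained=False, revision="main")
print(corpus)  # prints train/dev/test sentence counts

# Inspect which labels survive the mapping; "ner" is the column type
# configured by the loader.
label_dict = corpus.make_label_dictionary(label_type="ner")
print(label_dict)

# The fine-grained setting keeps the full BarNER tag set (EVENT, LANG, WOA, ...).
corpus_fine = NER_BAVARIAN_WIKI(fine_grained=True)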