tests: add basic sentence & token count test for new BarNER dataset

flairNLP · Feb 1, 2025 · dcd029b · dcd029b
1 parent f8186a3
commit dcd029b
Showing 1 changed file with 23 additions and 0 deletions.
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
@@ -954,6 +954,29 @@ def test_german_mobie(tasks_base_path):
     ), f"Number of parsed tokens ({actual_tokens}) does not match with reported number of tokens ({ref_tokens})!"
 
 
+@pytest.mark.skip()
+def test_bavarian_wiki(tasks_base_path):
+    """Check parsed sentence and token totals of NER_BAVARIAN_WIKI against the reported counts."""
+    ref_sentences, ref_tokens = 3_577, 75_690
+    corpus = flair.datasets.NER_BAVARIAN_WIKI()
+
+    # One pass over train + dev + test; sentences starting with "-DOCSTART-" are left out.
+    actual_sentences = 0
+    actual_tokens = 0
+    for sent in corpus.train + corpus.dev + corpus.test:
+        if sent[0].text != "-DOCSTART-":
+            actual_sentences += 1
+            actual_tokens += len(sent)
+
+    assert actual_sentences == ref_sentences, (
+        f"Number of parsed sentences ({actual_sentences}) does not match with "
+        f"reported number of sentences ({ref_sentences})!"
+    )
+    assert (
+        actual_tokens == ref_tokens
+    ), f"Number of parsed tokens ({actual_tokens}) does not match with reported number of tokens ({ref_tokens})!"
+
+
 def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path):
     corpus = MultiFileJsonlCorpus(
         train_files=[tasks_base_path / "jsonl/train.jsonl"],