Skip to content

Commit

Permalink
tests: add basic sentence & token count test for new BarNER dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
stefan-it committed Feb 1, 2025
1 parent f8186a3 commit dcd029b
Showing 1 changed file with 23 additions and 0 deletions.
23 changes: 23 additions & 0 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -954,6 +954,29 @@ def test_german_mobie(tasks_base_path):
), f"Number of parsed tokens ({actual_tokens}) does not match with reported number of tokens ({ref_tokens})!"


@pytest.mark.skip()
def test_bavarian_wiki(tasks_base_path):
    """Compare the parsed BarNER corpus against its reported sentence and token totals.

    All three splits (train, dev, test) are counted together. Sentences whose
    first token is the ``-DOCSTART-`` marker are left out of both totals.
    """
    # Reference figures the parsed corpus is expected to reproduce.
    ref_sentences = 3_577
    ref_tokens = 75_690

    corpus = flair.datasets.NER_BAVARIAN_WIKI()

    actual_sentences = 0
    actual_tokens = 0
    for sentence in corpus.train + corpus.dev + corpus.test:
        # Document-boundary markers are not real sentences; skip them entirely.
        if sentence[0].text == "-DOCSTART-":
            continue
        actual_sentences += 1
        actual_tokens += len(sentence)

    sentence_mismatch = (
        f"Number of parsed sentences ({actual_sentences}) does not match with "
        f"reported number of sentences ({ref_sentences})!"
    )
    assert ref_sentences == actual_sentences, sentence_mismatch

    token_mismatch = (
        f"Number of parsed tokens ({actual_tokens}) does not match with reported number of tokens ({ref_tokens})!"
    )
    assert ref_tokens == actual_tokens, token_mismatch


def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path):
corpus = MultiFileJsonlCorpus(
train_files=[tasks_base_path / "jsonl/train.jsonl"],
Expand Down

0 comments on commit dcd029b

Please sign in to comment.