diff --git a/data/datasets/__init__.py b/data/datasets/__init__.py index 7b2c077ea4..721c29bdab 100644 --- a/data/datasets/__init__.py +++ b/data/datasets/__init__.py @@ -34,6 +34,7 @@ "reasoning_bg_oa": "0x22almostEvil/reasoning_bg_oa", "reasoning_gsm_qna_oa": "0x22almostEvil/reasoning-gsm-qna-oa", "semantics_ws_qna_oa": "0x22almostEvil/semantics-ws-qna-oa", + "alpaca-fa-instruction": "pourmand1376/alpaca-fa-instruction", } SAFETY_DATASETS = { diff --git a/data/datasets/alpaca-fa-instruction/README.md b/data/datasets/alpaca-fa-instruction/README.md new file mode 100644 index 0000000000..ba76e71d10 --- /dev/null +++ b/data/datasets/alpaca-fa-instruction/README.md @@ -0,0 +1,2 @@ +This is a Persian instruction dataset. The dataset is uploaded +[here](https://huggingface.co/datasets/pourmand1376/alpaca-fa-instruction). diff --git a/data/datasets/alpaca-fa-multi/README.md b/data/datasets/alpaca-fa-multi/README.md new file mode 100644 index 0000000000..8128647b41 --- /dev/null +++ b/data/datasets/alpaca-fa-multi/README.md @@ -0,0 +1,4 @@ +This is a multi-turn Persian dataset which is in +[orca-chat](https://huggingface.co/datasets/shahules786/orca-chat) format. It is +published on +[Hugging Face](https://huggingface.co/datasets/pourmand1376/alpaca-fa-multi). diff --git a/model/model_training/custom_datasets/instruction.py b/model/model_training/custom_datasets/instruction.py index 7b6ad39787..a746940dc0 100644 --- a/model/model_training/custom_datasets/instruction.py +++ b/model/model_training/custom_datasets/instruction.py @@ -32,6 +32,7 @@ "evol_instruct_code": "nickrosh/Evol-Instruct-Code-80k-v1", "evol-codealpaca-v1": "theblackcat102/evol-codealpaca-v1", "cot_submix_original": "conceptofmind/cot_submix_original", + "alpaca-fa-instruction": "pourmand1376/alpaca-fa-instruction", }