# example-settings.toml

# Model backends, one sub-table per provider.
#   use    - switch for generating data with this backend.
#   use_qc - presumably selects this backend for the quality-control pass
#            (see [quality_control]) - TODO confirm against the consumer.
# The API keys below appear to be placeholders; supply real credentials locally.
[model.vllm]
model = "Qwen/Qwen2-72B-Instruct-AWQ"
host_url = "http://localhost:8000/v1/"
api_key = "sk-wowow-wowow"
# Use vllm
use = false
use_qc = true

[model.groq]
api_key = ""
model = "llama3-70b-8192"
# Use groq
use = false
use_qc = false

[model.ollama]
model = "qwen2:72b-instruct-q3_K_L"
host_url = ""
# Use ollama
use = false
use_qc = false

[model.anthropic]
api_key = "YOUR_API_KEY"
model = "claude-3-sonnet-20240229"
# Use the Anthropic model to generate data
use = false
use_qc = false

[model.openai]
api_key = "YOUR_API_KEY"
model = "gpt-4-turbo"
# Use the OpenAI model to generate data (cannot be used with the Anthropic model)
use = false
use_qc = false

[general]
output_dir = "output-qwen"
language = "th"
# Language to use for the Wikipedia API
wiki_lang = "th"
# Use the Wikipedia API to get articles for the tasks
use_wiki = true

num_topics = 1_000
topic_generation_temperature = 0.85
topic_generation_max_tokens = 400
# Put something here if you want to override the default system prompt:
# "Ensure that ALL of your response and output, other than pre-specified format
# keys, is in the {language} language. Always output in {language} even when
# input is in other languages."
custom_system_prompt = "ตรวจสอบให้แน่ใจว่าการตอบสนองและเอาต์พุตทั้งหมดของคุณ นอกเหนือจากคีย์รูปแบบที่กำหนดไว้ล่วงหน้า เป็นภาษาไทย จงตอบเป็นภาษาไทยเสมอ"

# Task types listed for the LLM.
# NOTE(review): "translation", "jokes" and "debate" have no matching
# [tasks.<name>] table in this file - confirm the consumer has defaults for them.
llm_task_types = [
    "question_answering",
    "multiple_choice",
    "summarization",
    "translation",
    "brainstorming",
    "conversation",
    "jokes",
    "debate",
]

# Filters to apply to the response.
# The entries use regular-expression syntax; presumably a response matching any
# of them is rejected - TODO confirm against the consumer.
# Literal strings ('...' / '''...''') are used where possible so that regex
# backslashes need no escaping.
response_filters = [
    'my programming',
    'openai',
    'language model',
    'large language',
    'as an? (ai|generative language|gpt|bot)',
    'illegal and dangerous',
    "i do(n't| not) (possess|have|exhibit) (personal|consciousness|subjective)",
    'personal (feelings|thoughts|emotions|desires|experiences|goals|objective|belief)',
    '''(can('t| ?not)|w(on't|will not)|unable.?) (\w+\s)+(with (that|your)|your \w+|provide)''',
    'my limitations',
    'the limitations of my',
    'my abilities',
    'violates my',
    "i (can('t| ?not)|w(on't|will not)|am (not |un)able.?).{0,30}(you are|you're|your )",
    'please note that',
    'flesch',
]

[culture]
enabled = true
num_topics = 2_000
# English version of the prompt, kept for reference:
# prompt = 'You are a native Thai person with expert knowledge of Thai culture, history, language, and customs. Ensure that everything you act, do, say, and generate matches with this fact.'
prompt = "คุณเป็นคนไทยโดยกำเนิดที่มีความรู้ความเชี่ยวชาญเกี่ยวกับวัฒนธรรม ประวัติศาสตร์ ภาษา และประเพณีไทย ตรวจสอบให้แน่ใจว่าทุกสิ่งที่คุณกระทำ ทำ พูด และสร้างตรงกับข้อเท็จจริงนี้"


# Configuration for the different tasks.
# Each [tasks.<name>] table carries the same knobs: `count`, `batch_size`,
# `temperature` and `max_tokens`. Presumably `count` is the number of samples
# to produce and `batch_size` the number requested per call - TODO confirm.
[tasks]
num_threads_per_task = 5
# Used for contextual tasks (e.g. summarization, translation)
context_styles = [
    "news article",
    "blog post",
    "text messages",
    "fictional short story",
    "video transcript",
    "song",
    "poem",
    "scientific study",
    "medical report",
    "social media post with replies",
    "email",
    "tweet",
    "how-to article",
]

[tasks.question_answering]
count = 25_000
batch_size = 10
temperature = 0.3
max_tokens = 4_000

[tasks.multiple_choice]
count = 25_000
batch_size = 10
temperature = 0.4
max_tokens = 4_000

[tasks.summarization]
count = 25_000
batch_size = 10
summary_styles = [
    "bullet point",
    "paragraph",
    "numbered list",
]
temperature = 0.35
max_tokens = 4_000

[tasks.brainstorming]
count = 25_000
batch_size = 10
temperature = 0.8
max_tokens = 4_000

[tasks.conversation]
count = 25_000
batch_size = 10
temperature = 0.8
max_tokens = 4_000

# Configuration for the cleaning process
[cleaning]
remove_duplicates = true
remove_empty_instructions = true
remove_empty_outputs = true

# Cosine-similarity filtering. Presumably entries whose embeddings (from
# `embed_model`) are more similar than the threshold count as near-duplicates
# - TODO confirm against the consumer.
use_cosine_filter = true
cosine_similarity_threshold = 0.95
embed_model = "BAAI/bge-m3"

# Sampling settings for quality control. Presumably these apply to whichever
# [model] backend has `use_qc = true` - TODO confirm against the consumer.
[quality_control]
temperature = 0.8
max_tokens = 1_500