# example-settings.toml
# Model providers. Every provider sub-table below lives under `model` and
# carries its own connection settings plus two switches, `use` and `use_qc`.
# NOTE(review): `use_qc` presumably selects the provider for the
# [quality_control] pass — confirm against the code that loads this file.

[model.vllm]
model = "Qwen/Qwen2-72B-Instruct-AWQ"
host_url = "http://localhost:8000/v1/"
api_key = "sk-wowow-wowow"
# Use vllm
use = false
use_qc = true

[model.groq]
api_key = ""
model = "llama3-70b-8192"
# Use groq
use = false
use_qc = false

[model.ollama]
model = "qwen2:72b-instruct-q3_K_L"
host_url = ""
# Use ollama
use = false
use_qc = false

[model.anthropic]
api_key = "YOUR_API_KEY"
model = "claude-3-sonnet-20240229"
# Use the Anthropic model to generate data
use = false
use_qc = false

[model.openai]
api_key = "YOUR_API_KEY"
model = "gpt-4-turbo"
# Use the OpenAI model to generate data (cannot be used with the Anthropic model)
use = false
use_qc = false
# Run-wide settings: output directory, languages, topic generation,
# the list of task types, and the response filters.
[general]
output_dir = "output-qwen"
language = "th"

# Language to use for the Wikipedia API
wiki_lang = "th"
# Use the Wikipedia API to get articles for the tasks
use_wiki = true

num_topics = 1_000
topic_generation_temperature = 0.85
topic_generation_max_tokens = 400

# Put something here if you want to override the default system prompt:
# "Ensure that ALL of your response and output, other than pre-specified
# format keys, is in the {language} language. Always output in {language}
# even when input is in other languages."
custom_system_prompt = "ตรวจสอบให้แน่ใจว่าการตอบสนองและเอาต์พุตทั้งหมดของคุณ นอกเหนือจากคีย์รูปแบบที่กำหนดไว้ล่วงหน้า เป็นภาษาไทย จงตอบเป็นภาษาไทยเสมอ"

# NOTE(review): only question_answering, multiple_choice, summarization,
# brainstorming and conversation have a [tasks.<name>] table in this file;
# 'translation', 'jokes' and 'debate' do not — confirm how the consumer
# handles task types without their own table.
llm_task_types = [
    "question_answering",
    "multiple_choice",
    "summarization",
    "translation",
    "brainstorming",
    "conversation",
    "jokes",
    "debate",
]

# Filters to apply to the response.
# Entries use regular-expression syntax. Those containing an apostrophe are
# written as basic (double-quoted) strings, so a backslash there is doubled:
# `\\w` in this file is the regex `\w`.
# NOTE(review): all entries are lowercase — presumably matched
# case-insensitively or against lowercased text; confirm.
response_filters = [
    'my programming',
    'openai',
    'language model',
    'large language',
    'as an? (ai|generative language|gpt|bot)',
    'illegal and dangerous',
    "i do(n't| not) (possess|have|exhibit) (personal|consciousness|subjective)",
    'personal (feelings|thoughts|emotions|desires|experiences|goals|objective|belief)',
    "(can('t| ?not)|w(on't|will not)|unable.?) (\\w+\\s)+(with (that|your)|your \\w+|provide)",
    'my limitations',
    'the limitations of my',
    'my abilities',
    'violates my',
    "i (can('t| ?not)|w(on't|will not)|am (not |un)able.?).{0,30}(you are|you're|your )",
    'please note that',
    'flesch',
]
# Culture settings: an on/off switch, a topic count, and the prompt text.
# NOTE(review): `num_topics` here is set independently of
# general.num_topics — confirm how the two interact.
[culture]
enabled = true
num_topics = 2_000

# English equivalent of the Thai prompt below:
# 'You are a native Thai person with expert knowledge of Thai culture, history, language, and customs. Ensure that everything you act, do, say, and generate matches with this fact.'
prompt = 'คุณเป็นคนไทยโดยกำเนิดที่มีความรู้ความเชี่ยวชาญเกี่ยวกับวัฒนธรรม ประวัติศาสตร์ ภาษา และประเพณีไทย ตรวจสอบให้แน่ใจว่าทุกสิ่งที่คุณกระทำ ทำ พูด และสร้างตรงกับข้อเท็จจริงนี้'
# Configuration for the different tasks.
# Keys set directly on [tasks] sit alongside the per-task [tasks.<name>] tables.
[tasks]
num_threads_per_task = 5

# Used for contextual tasks (e.g. summarization, translation)
context_styles = [
    "news article",
    "blog post",
    "text messages",
    "fictional short story",
    "video transcript",
    "song",
    "poem",
    "scientific study",
    "medical report",
    "social media post with replies",
    "email",
    "tweet",
    "how-to article",
]
# Settings for the 'question_answering' task type.
# NOTE(review): `count` is presumably the number of examples to produce and
# `batch_size` the number requested per call — confirm against the consumer.
[tasks.question_answering]
count = 25_000
batch_size = 10
max_tokens = 4_000
temperature = 0.3
# Settings for the 'multiple_choice' task type.
# NOTE(review): `count` is presumably the number of examples to produce and
# `batch_size` the number requested per call — confirm against the consumer.
[tasks.multiple_choice]
count = 25_000
batch_size = 10
max_tokens = 4_000
temperature = 0.4
# Settings for the 'summarization' task type.
# NOTE(review): `count` is presumably the number of examples to produce and
# `batch_size` the number requested per call — confirm against the consumer.
[tasks.summarization]
count = 25_000
batch_size = 10
max_tokens = 4_000
temperature = 0.35
# Summary formats specific to this task type.
summary_styles = ["bullet point", "paragraph", "numbered list"]
# Settings for the 'brainstorming' task type.
# NOTE(review): `count` is presumably the number of examples to produce and
# `batch_size` the number requested per call — confirm against the consumer.
[tasks.brainstorming]
count = 25_000
batch_size = 10
max_tokens = 4_000
temperature = 0.8
# Settings for the 'conversation' task type.
# NOTE(review): `count` is presumably the number of examples to produce and
# `batch_size` the number requested per call — confirm against the consumer.
[tasks.conversation]
count = 25_000
batch_size = 10
max_tokens = 4_000
temperature = 0.8
# Configuration for the cleaning process
[cleaning]
remove_duplicates = true
remove_empty_instructions = true
remove_empty_outputs = true

# Cosine-similarity filter settings.
# NOTE(review): presumably `embed_model` produces the embeddings that are
# compared against `cosine_similarity_threshold` — confirm the comparison
# direction (drop at or above the threshold?) in the consumer.
use_cosine_filter = true
cosine_similarity_threshold = 0.95
embed_model = "BAAI/bge-m3"
# Settings for the quality-control step.
# NOTE(review): presumably applied to whichever provider under `model` has
# `use_qc = true` — confirm against the consumer.
[quality_control]
max_tokens = 1_500
temperature = 0.8