#!/bin/bash
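#
# Build and launch an rllm server together with the aicirt runtime.
# By default this uses the rllm-cuda crate; set CPP=1 to use the
# llama.cpp-based rllm-llamacpp crate instead.
# Run without arguments for usage.
#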
set -e
WS=$(cd "$(dirname "$0")/../.." && pwd)
REL=--release
LOOP=
BUILD=
ADD_ARGS=
R_LOG=info,tokenizers=error,rllm=debug,aicirt=info
VER=
mkdir -p "$WS/target"
BIN="$WS/target"
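# Fetch git submodules (cutlass etc.) if they have not been checked out yet.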
if [ ! -f "$WS/rllm/tch-cuda/cutlass/README.md" ] ; then
    (cd "$WS" && git submodule update --init --recursive)
fi
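# Parse leading options; the first non-option argument is the model name.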
while [ "$1" != "" ] ; do
    case "$1" in
        --release )
            REL=--release
            ;;
        --debug )
            REL=
            ;;
        --loop )
            LOOP=1
            ;;
        --timer )
            ADD_ARGS="$ADD_ARGS -A--wasm-timer-resolution-us=1"
            ;;
        --cuda )
            if [ "$CPP" = 1 ] ; then
                VER="$VER --features cuda"
                ADD_ARGS="$ADD_ARGS --gpu-layers 1000"
            else
                echo "--cuda is only valid for llama.cpp (CPP=1)"
                exit 1
            fi
            ;;
        --trace )
            R_LOG=info,tokenizers=error,rllm=trace,aicirt=info,llama_cpp_low=trace
            ;;
        --trace-rt )
            R_LOG=info,tokenizers=error,rllm=trace,aicirt=trace,llama_cpp_low=trace
            ;;
        * )
            break
            ;;
    esac
    shift
done
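# Pre-computed expected outputs for the built-in tests (passed via -w below).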
EXPECTED="$WS/rllm/rllm-cuda/expected"
if [ "$CPP" = 1 ] ; then
BIN_NAME=rllm-llamacpp
FOLDER_NAME=rllm-llamacpp
case "$1" in
phi2 )
ARGS="-m https://huggingface.co/TheBloke/phi-2-GGUF/blob/main/phi-2.Q8_0.gguf -t phi -w $EXPECTED/phi-2/cats.safetensors -s test_maxtol=0.8 -s test_avgtol=0.3"
;;
orca )
ARGS="-m https://huggingface.co/TheBloke/Orca-2-13B-GGUF/blob/main/orca-2-13b.Q8_0.gguf -t orca -w $EXPECTED/orca/cats.safetensors"
;;
mistral )
ARGS="-m https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/blob/main/mistral-7b-instruct-v0.2.Q5_K_M.gguf"
;;
mixtral )
ARGS="-m https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/blob/main/mixtral-8x7b-instruct-v0.1.Q6_K.gguf"
;;
code70 )
ARGS="-m https://huggingface.co/TheBloke/CodeLlama-70B-Instruct-GGUF/blob/main/codellama-70b-instruct.Q5_K_M.gguf"
;;
sql )
ARGS="-m https://huggingface.co/TheBloke/sqlcoder2-GGUF/blob/main/sqlcoder2.Q5_K_M.gguf -t defog/sqlcoder2"
;;
https* )
ARGS="-m $1"
;;
build )
BUILD=1
;;
* )
SELF="server.sh"
cat <<EOF
usage: $SELF [--loop] [--cuda] [--debug] [model_name] [rllm_args...]
model_name can a HuggingFace URL pointing to a .gguf file, or one of the following:
phi2 https://huggingface.co/TheBloke/phi-2-GGUF/blob/main/phi-2.Q8_0.gguf
orca https://huggingface.co/TheBloke/Orca-2-13B-GGUF/blob/main/orca-2-13b.Q8_0.gguf
mistral https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/blob/main/mistral-7b-instruct-v0.2.Q5_K_M.gguf
mixtral https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/blob/main/mixtral-8x7b-instruct-v0.1.Q6_K.gguf
code70 https://huggingface.co/TheBloke/CodeLlama-70B-Instruct-GGUF/blob/main/codellama-70b-instruct.Q5_K_M.gguf
Additionally, "$SELF build" will just build the server, and not run a model.
--cuda try to build llama.cpp against installed CUDA
--loop restart server when it crashes and store logs in ./logs
--debug don't build in --release mode
Try $SELF phi2 --help to see available rllm_args
EOF
exit 1
;;
esac
else
    BIN_NAME=rllm-cuda
    FOLDER_NAME=rllm-cuda
    case "$1" in
        phi )
            ARGS="-m microsoft/phi-1_5@refs/pr/66 -t phi -w $EXPECTED/phi-1_5/cats.safetensors"
            ;;
        phi2 )
            ARGS="-m microsoft/phi-2@d3186761bf5c4409f7679359284066c25ab668ee -t phi -w $EXPECTED/phi-2/cats.safetensors"
            ;;
        7 | 7b )
            ARGS="-m NousResearch/Llama-2-7b-hf -t llama -w $EXPECTED/llama/cats.safetensors"
            ;;
        code )
            ARGS="-m codellama/CodeLlama-13b-Instruct-hf -t llama16 -w $EXPECTED/codellama/cats.safetensors"
            ;;
        code34 )
            # OOM in hf transformers - can't generate test cases...
            ARGS="-m codellama/CodeLlama-34b-Instruct-hf -t llama"
            ;;
        orca )
            ARGS="-m microsoft/Orca-2-13b@refs/pr/22 -t orca -w $EXPECTED/orca/cats.safetensors"
            ;;
        build )
            BUILD=1
            REL=--release
            ;;
        * )
            SELF="server.sh"
            cat <<EOF
usage: $SELF [--loop] [--debug] [model_name] [rllm_args...]

model_name can be a HuggingFace model ID (with an optional revision after @), or one of the following:

  orca    microsoft/Orca-2-13b@refs/pr/22
  phi     microsoft/phi-1_5@refs/pr/66
  phi2    microsoft/phi-2@d3186761bf5c4409f7679359284066c25ab668ee
  7b      NousResearch/Llama-2-7b-hf
  code    codellama/CodeLlama-13b-Instruct-hf
  code34  codellama/CodeLlama-34b-Instruct-hf

Additionally, "$SELF build" will only build the server, without running a model.

  --loop   restart the server when it crashes and store logs in ./logs
  --debug  don't build in --release mode

Try '$SELF phi2 --help' to see the available rllm_args.
EOF
            exit 1
            ;;
    esac
fi
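# Drop the model name; the remaining arguments are forwarded to the server.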
shift
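# When not pinned to specific GPUs, kill any stray aicirt/rllm server
# processes left over from previous runs.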
if [ "X$CUDA_VISIBLE_DEVICES" = "X" ] ; then
P=`ps -ax|grep 'aicir[t]\|rllm-serve[r]|rll[m]-cpp' | awk '{print $1}' | xargs echo`
if [ "X$P" != "X" ] ; then
echo "KILL $P"
kill $P
fi
else
echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
fi
ARGS="--verbose --aicirt $BIN/release/aicirt $ARGS $ADD_ARGS"
(cd "$WS/aicirt" && cargo build --release)
(cd "$WS/rllm/$FOLDER_NAME" && cargo build $REL $VER)
if [ "X$REL" = "X" ] ; then
BIN_SERVER=$BIN/debug/$BIN_NAME
else
BIN_SERVER=$BIN/release/$BIN_NAME
fi
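# When --profile-step is among the server args, run under Nsight Systems
# and save a stats summary to tmp/perf.txt.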
if echo "$*" | grep -q -- --profile-step ; then
    rm -f profile.ncu-rep report1.*
    (cd "$WS/rllm/$FOLDER_NAME" && cargo build --release)
    mkdir -p tmp
    RUST_LOG=info \
        nsys profile -c cudaProfilerApi \
        $BIN_SERVER \
        $ARGS "$@"
    nsys stats ./report1.nsys-rep > tmp/perf.txt
    echo "Opening tmp/perf.txt in VSCode; use Alt-Z to toggle word wrap"
    code tmp/perf.txt
    exit
fi
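# In 'build' mode, stop after compiling.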
if [ "$BUILD" = "1" ] ; then
exit
fi
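# On macOS, the tch-based server binary needs the torch libraries on its
# RPATH; compute the path once via Python and cache it in tmp/torch-path.txt.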
if [ "$CPP" != "1" ] && [ `uname` = Darwin ] ; then
mkdir -p tmp
TP=tmp/torch-path.txt
if test -f $TP ; then
:
else
python3 -c "from torch.utils import cpp_extension; print(cpp_extension.library_paths()[0])" > $TP
fi
TP=`cat $TP`
echo "Updating torch RPATH: $TP on $BIN_SERVER"
install_name_tool -add_rpath $TP $BIN_SERVER
fi
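# Enable backtraces and apply the log filter selected above.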
export RUST_BACKTRACE=1
export RUST_LOG=$R_LOG
echo "running $BIN_SERVER $ARGS $@"
if [ "$LOOP" = "" ] ; then
$BIN_SERVER $ARGS "$@"
exit $?
fi
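# With --loop, restart the server whenever it exits; rotatelogs (-D creates
# the ./logs directory) starts a new log file every hour.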
set +e
while : ; do
    $BIN_SERVER --daemon $ARGS "$@" 2>&1 | rotatelogs -e -D ./logs/%Y-%m-%d-%H_%M_%S.txt 3600
    echo "restarting..."
    sleep 2
done