#!/bin/sh
# Copyright 2026 danix <danix@danix.xyz>
# All rights reserved.
#
# Redistribution and use of this script, with or without modification, is
# permitted provided that the following conditions are met:
#
# 1. Redistributions of this script must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# SERVER OPTIONS
# Many of these options can also be set as environment variables.
# See https://github.com/ggml-org/llama.cpp/tree/master/tools/server
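#
# A hedged example of the environment-variable form (LLAMA_ARG_* names as
# documented at the URL above; the model path below is only a placeholder):
#   LLAMA_ARG_HOST=0.0.0.0 LLAMA_ARG_PORT=8181 llama-server -m /path/to/model.gguf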
RUNAS_USER= # unprivileged user to run the server as (su falls back to root if this is left empty)
RUNAS_GROUP=users
LLSRV=/usr/bin/llama-server
#MODEL_DIR=
MODEL= # path to the .gguf model file to serve (must be set)
HOST=127.0.0.1 # ip to bind the server to
PORT=8181 # port for the server to listen to
CONTEXT= # size of the context window, in tokens (must be set)
TEMP=1.0
TOP_K=20
TOP_P=0.95
MIN_P=0.00
SLEEP_IDLE_SECONDS=500 # seconds of idle time after which the model is unloaded from memory
REASONING_FORMAT=deepseek
GPU_LAYERS=all
SPLIT_MODE=none
NO_CONTEXT_SHIFT='--no-context-shift'
LOG_FILE=/var/log/llama-server/server.log
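# Example configuration (hypothetical values, not llama.cpp defaults -- adjust
# user, paths and sizes for your own setup):
#   RUNAS_USER=llama
#   MODEL=/srv/models/my-model.gguf
#   CONTEXT=8192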
LLAMA_ARGS="--ctx-size $CONTEXT \
--jinja \
--temp $TEMP \
--top-k $TOP_K \
--top-p $TOP_P \
--min-p $MIN_P \
--host $HOST --port $PORT \
--reasoning-format $REASONING_FORMAT \
-sm $SPLIT_MODE \
$NO_CONTEXT_SHIFT \
-ngl $GPU_LAYERS \
--sleep-idle-seconds $SLEEP_IDLE_SECONDS \
-m $MODEL" # alternatively one could set the model directory and pass --models-dir to the script
case "$1" in
stop)
if /usr/bin/pgrep -f "$LLSRV" >/dev/null; then
echo "Stopping llama-server..."
killall llama-server 2>/dev/null
else
echo "llama-server is not running..."
exit 1
fi
;;
start)
echo "Starting llama-server..."
if [ ! -d "$(dirname "$LOG_FILE")" ]; then
mkdir -p "$(dirname "$LOG_FILE")"
fi
touch "$LOG_FILE"
chown -R "${RUNAS_USER}:${RUNAS_GROUP}" "$(dirname "$LOG_FILE")"
su $RUNAS_USER -c "$LLSRV $LLAMA_ARGS --log-file $LOG_FILE -lv 3 --log-timestamps" 1>/dev/null 2>&1 &
$0 status
;;
restart)
$0 stop
sleep 1
$0 start
;;
status)
if /usr/bin/pgrep -f "$LLSRV" >/dev/null; then
echo "llama-server is running"
echo "Browser interface is at:"
echo "http://${HOST}:${PORT}"
else
echo "llama-server is not running"
fi
;;
*)
echo "usage: $0 { start | stop | status | restart }" >&2
exit 1
;;
esac
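
# Usage sketch -- the install path below is only an example (a Slackware-style
# rc script location); adjust it to match your system:
#   chmod +x /etc/rc.d/rc.llama-server
#   /etc/rc.d/rc.llama-server start    # launch llama-server in the background
#   /etc/rc.d/rc.llama-server status   # print the web UI address if it is running
#   /etc/rc.d/rc.llama-server stop     # kill the running server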