Skip to content

Commit de19e7e

Browse files
authored
Moving to uv instead of poetry. (#2919)
* Moving to `uv` instead of `poetry`. More in the standard, faster, seemingly better lockfile. * Creating venv if not created. * Create the venv. * Fix ? * Fixing the test by activating the environment ? * Install system ? * Add the cli entry point. * docker install on system * Monkeying this... * `--system` is redundant. * Trying to force-include this pb folder. * Trying to check that pb is imported correctly. * Editable install necessary ? * Non editable? * Editable it is.
1 parent d61f14f commit de19e7e

File tree

9 files changed

+3536
-4208
lines changed

9 files changed

+3536
-4208
lines changed

.github/workflows/tests.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,14 @@ jobs:
4444
run: |
4545
sudo apt update
4646
sudo apt install python3.11-dev -y
47+
pip install -U pip uv
48+
uv venv
49+
source ./.venv/bin/activate
4750
make install-cpu
4851
- name: Run server tests
4952
run: |
50-
pip install pytest
53+
source ./.venv/bin/activate
54+
uv pip install pytest
5155
export HF_TOKEN=${{ secrets.HF_TOKEN }}
5256
pytest -s -vv server/tests
5357
- name: Pre-commit checks

Dockerfile

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -224,17 +224,19 @@ COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-
224224
COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /opt/conda/lib/python3.11/site-packages/flashinfer/
225225

226226
# Install flash-attention dependencies
227-
RUN pip install einops --no-cache-dir
227+
# RUN pip install einops --no-cache-dir
228228

229229
# Install server
230230
COPY proto proto
231231
COPY server server
232232
COPY server/Makefile server/Makefile
233+
ENV UV_SYSTEM_PYTHON=1
233234
RUN cd server && \
234235
make gen-server && \
235-
pip install -r requirements_cuda.txt && \
236-
pip install ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
237-
pip install nvidia-nccl-cu12==2.22.3
236+
python -c "from text_generation_server.pb import generate_pb2" && \
237+
pip install -U pip uv && \
238+
uv pip install -e ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
239+
uv pip install nvidia-nccl-cu12==2.22.3
238240

239241
ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
240242
# Required to find libpython within the rust binaries

Dockerfile_amd

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ RUN case ${TARGETPLATFORM} in \
104104
/opt/conda/bin/conda clean -ya
105105

106106
# Install flash-attention, torch dependencies
107-
RUN python3 -m pip install --upgrade pip && pip install numpy einops ninja joblib msgpack cmake --no-cache-dir && rm -rf /var/lib/apt/lists/*
107+
RUN python3 -m pip install --upgrade pip uv && pip install numpy einops ninja joblib msgpack cmake --no-cache-dir && rm -rf /var/lib/apt/lists/*
108108

109109
RUN conda install mkl=2021
110110
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/opt/conda/lib/python3.11/site-packages/torch/lib:/opt/conda/lib/
@@ -318,10 +318,11 @@ COPY --from=moe-kernels /usr/src/moe-kernels/build/lib.linux-x86_64-cpython-311
318318
COPY proto proto
319319
COPY server server
320320
COPY server/Makefile server/Makefile
321+
ENV UV_SYSTEM_PYTHON=1
321322
RUN cd server && \
322323
make gen-server && \
323-
pip install -r requirements_rocm.txt && \
324-
pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
324+
pip install -U pip uv && \
325+
uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
325326

326327
# Install benchmarker
327328
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark

Dockerfile_intel

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -108,10 +108,11 @@ RUN pip install triton-xpu==3.0.0b2 --no-cache-dir
108108
COPY proto proto
109109
COPY server server
110110
COPY server/Makefile server/Makefile
111+
ENV UV_SYSTEM_PYTHON=1
111112
RUN cd server && \
112113
make gen-server && \
113-
pip install -r requirements_intel.txt && \
114-
pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
114+
pip install -U pip uv && \
115+
uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
115116

116117
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/oneapi/pti/0.9/lib:/opt/conda/lib
117118
ENV CCL_ZE_IPC_EXCHANGE=sockets
@@ -211,10 +212,11 @@ ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/"
211212
COPY proto proto
212213
COPY server server
213214
COPY server/Makefile server/Makefile
215+
ENV UV_SYSTEM_PYTHON=1
214216
RUN cd server && \
215217
make gen-server && \
216-
pip install -r requirements_intel.txt && \
217-
pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
218+
pip install -U pip uv && \
219+
uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
218220

219221
# Install benchmarker
220222
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark

flake.lock

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

server/Makefile

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,36 +9,29 @@ include Makefile-exllamav2
99
include Makefile-flashinfer
1010

1111
unit-tests:
12+
pip install -U pip uv
13+
uv pip install -e ".[dev]"
1214
pytest -s -vv -m "not private" tests
1315

1416
gen-server:
1517
# Compile protos
16-
pip install grpcio-tools==1.62.2 mypy-protobuf==3.6.0 'types-protobuf' --no-cache-dir
18+
pip install -U pip uv
19+
uv pip install ".[gen]"
1720
mkdir text_generation_server/pb || true
1821
python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \
1922
--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto
2023
find text_generation_server/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
2124
touch text_generation_server/pb/__init__.py
2225

2326
install-server: gen-server
24-
pip install pip --upgrade
25-
pip install -r requirements_cuda.txt
26-
pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"
27+
uv pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"
2728

2829

2930
install: install-cuda
3031
echo "Installed server"
3132

3233
install-cuda: install-server install-flash-attention-v2-cuda install-flash-attention
33-
pip install -e ".[attention,bnb,marlin,moe]"
34-
pip install nvidia-nccl-cu12==2.22.3
34+
uv pip install -e ".[attention,bnb,marlin,moe]"
35+
uv pip install nvidia-nccl-cu12==2.22.3
3536

3637
install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm
37-
38-
run-dev:
39-
SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded
40-
41-
export-requirements:
42-
poetry export -o requirements_cuda.txt --without-hashes
43-
poetry export -o requirements_rocm.txt --without-hashes
44-
poetry export -o requirements_intel.txt --without-hashes

0 commit comments

Comments
 (0)