diff --git a/Dockerfile b/Dockerfile index afe4551f0..afad09fb0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,18 @@ FROM python:3.9-slim -WORKDIR /app -COPY . /app - RUN apt-get update && apt-get install -y \ build-essential libsndfile1 \ && rm -rf /var/lib/apt/lists/* +WORKDIR /app +COPY . /app + RUN pip install -e . RUN python -m unidic download RUN python melo/init_downloads.py -CMD ["python", "./melo/app.py", "--host", "0.0.0.0", "--port", "8888"] \ No newline at end of file +# Copy entrypoint script and make it executable +COPY entrypoint.sh /usr/local/bin/entrypoint.sh +RUN chmod +x /usr/local/bin/entrypoint.sh + +# Set the entrypoint script +ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] \ No newline at end of file diff --git a/docs/install.md b/docs/install.md index eea136783..7aa4b5efe 100644 --- a/docs/install.md +++ b/docs/install.md @@ -31,6 +31,7 @@ docker build -t melotts . ``` **Run Docker** +Run as a default Gradio app: ```bash docker run -it -p 8888:8888 melotts ``` @@ -38,6 +39,11 @@ If your local machine has GPU, then you can choose to run: ```bash docker run --gpus all -it -p 8888:8888 melotts ``` + +Run as a FastAPI streaming server: +```bash +docker run --gpus all -it -p 8888:8888 -e APP_MODE=api melotts +``` Then open [http://localhost:8888](http://localhost:8888) in your browser to use the app. ## Usage @@ -51,6 +57,44 @@ melo-ui # Or: python melo/app.py ``` +### Streaming API +One application for the streaming API could be for an AI assistant. 
The following block of code provides some guidance on how to read from the stream: +```python +import requests +import subprocess + +def stream_ffplay(audio_stream): + ffplay_cmd = ["ffplay", "-nodisp", "-probesize", "2048", "-autoexit", "-"] + ffplay_proc = subprocess.Popen(ffplay_cmd, stdin=subprocess.PIPE) + + for chunk in audio_stream: + if chunk is not None: + ffplay_proc.stdin.write(chunk) + + # close on finish + ffplay_proc.stdin.close() + ffplay_proc.wait() + +def tts(text, speaker='EN-US', language='EN', speed=1): + res = requests.post( + "http://localhost:8888/stream", + json={ + "text": text, + "language": language, + "speed": speed, + "speaker": speaker + }, + stream=True, + ) + for chunk in res.iter_content(chunk_size=512): + if chunk: + yield chunk + +stream_ffplay( + tts("Ahoy there matey! How goes it?") +) +``` + ### CLI You may use the MeloTTS CLI to interact with MeloTTS. The CLI may be invoked using either `melotts` or `melo`. Here are some examples: diff --git a/entrypoint.sh b/entrypoint.sh new file mode 100644 index 000000000..27b521d7f --- /dev/null +++ b/entrypoint.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# Default to the Gradio app if no APP_MODE is specified; only APP_MODE=api starts the FastAPI server +APP_MODE=${APP_MODE:-gradio} + +if [ "$APP_MODE" = "api" ]; then + exec uvicorn melo.fastapi_server:app --host "0.0.0.0" --port "8888" --reload +else + exec python ./melo/app.py --host "0.0.0.0" --port "8888" +fi \ No newline at end of file diff --git a/melo/fastapi_server.py b/melo/fastapi_server.py new file mode 100644 index 000000000..f9abe8a8b --- /dev/null +++ b/melo/fastapi_server.py @@ -0,0 +1,39 @@ +from fastapi import FastAPI, File, UploadFile +from pydantic import BaseModel +import io +from melo.api import TTS +from fastapi.responses import StreamingResponse + +app = FastAPI() + +# Initialize the TTS models as before +device = 'auto' +models = { + 'EN': TTS(language='EN', device=device), + 'ES': TTS(language='ES', device=device), + 'FR': TTS(language='FR', device=device), + 'ZH': 
TTS(language='ZH', device=device), + 'JP': TTS(language='JP', device=device), + 'KR': TTS(language='KR', device=device), +} + +class SynthesizePayload(BaseModel): + text: str = 'Ahoy there matey! There she blows!' + language: str = 'EN' + speaker: str = 'EN-US' + speed: float = 1.0 + +@app.post("/stream") +async def synthesize_stream(payload: SynthesizePayload): + language = payload.language + text = payload.text + speaker = payload.speaker or list(models[language].hps.data.spk2id.keys())[0] + speed = payload.speed + + def audio_stream(): + bio = io.BytesIO() + models[language].tts_to_file(text, models[language].hps.data.spk2id[speaker], bio, speed=speed, format='wav') + audio_data = bio.getvalue() + yield audio_data + + return StreamingResponse(audio_stream(), media_type="audio/wav") \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index af4cb60f3..ae1c5dce9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,3 +28,6 @@ langid==1.1.6 tqdm tensorboard==2.16.2 loguru==0.7.2 +fastapi +uvicorn +pydantic \ No newline at end of file