From 838f1c737e436f64447e9b6b72071bdac518dc59 Mon Sep 17 00:00:00 2001 From: Alexander Thiess Date: Tue, 9 Sep 2025 06:48:51 +0200 Subject: [PATCH] first steps --- .python-version | 1 + CLAUDE.md | 149 +++++++++++++++++ README.md | 280 ++++++++++++++++++++++++++++++++ pyproject.toml | 12 ++ src/.env.example | 6 + src/data/models.json | 22 +++ src/endpoints/__init__.py | 0 src/endpoints/models.py | 97 +++++++++++ src/endpoints/v1/__init__.py | 0 src/endpoints/v1/chat.py | 72 ++++++++ src/endpoints/v1/completions.py | 64 ++++++++ src/endpoints/v1/embeddings.py | 45 +++++ src/endpoints/v1/misc.py | 97 +++++++++++ src/endpoints/v1/models.py | 24 +++ src/main.py | 77 +++++++++ src/models/__init__.py | 3 + src/models/model.py | 75 +++++++++ src/services/__init__.py | 3 + src/services/model_manager.py | 113 +++++++++++++ src/services/persistence.py | 67 ++++++++ uv.lock | 204 +++++++++++++++++++++++ 21 files changed, 1411 insertions(+) create mode 100644 .python-version create mode 100644 CLAUDE.md create mode 100644 pyproject.toml create mode 100644 src/.env.example create mode 100644 src/data/models.json create mode 100644 src/endpoints/__init__.py create mode 100644 src/endpoints/models.py create mode 100644 src/endpoints/v1/__init__.py create mode 100644 src/endpoints/v1/chat.py create mode 100644 src/endpoints/v1/completions.py create mode 100644 src/endpoints/v1/embeddings.py create mode 100644 src/endpoints/v1/misc.py create mode 100644 src/endpoints/v1/models.py create mode 100644 src/main.py create mode 100644 src/models/__init__.py create mode 100644 src/models/model.py create mode 100644 src/services/__init__.py create mode 100644 src/services/model_manager.py create mode 100644 src/services/persistence.py create mode 100644 uv.lock diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..24ee5b1 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.13 diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..8bde428 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,149 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +This is a vLLM proxy REST API that solves the limitation of vLLM only being able to load one model at a time per process. The proxy acts as a daemon that manages multiple vLLM instances and routes requests to the appropriate instance. 
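+
+A minimal sketch of the intended routing flow (request proxying is not yet implemented; this assumes an HTTP client such as `httpx` would be added as a dependency, and the helper name `forward_request` is illustrative only):
+
+```python
+import httpx
+
+
+async def forward_request(model_port: int, path: str, payload: dict) -> dict:
+    """Forward an OpenAI-style request body to the vLLM instance on model_port."""
+    async with httpx.AsyncClient() as client:
+        response = await client.post(
+            f"http://127.0.0.1:{model_port}{path}",
+            json=payload,
+            timeout=None,  # generation requests can run for a long time
+        )
+        response.raise_for_status()
+        return response.json()
+```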
+ +## Key Architecture Decisions + +- **Main entry point**: `src/main.py` +- **Package manager**: uv (not pip or poetry) +- **Python version**: 3.13 +- **Configuration**: `.env` file for main configuration +- **Source organization**: All source files go in `src/` directory +- **Endpoint structure**: Endpoints are organized as separate modules +- **Data persistence**: Models saved to `data/models.json` (configurable via `DATA_DIR`) + +## Development Commands + +```bash +# Install dependencies +uv sync + +# Run the application from project root +uv run python src/main.py + +# Run on different port +APP_PORT=8081 uv run python src/main.py + +# Add a new dependency +uv add + +# Add a development dependency +uv add --dev +``` + +## API Endpoints + +### Model Management +- `GET /models` - List all models with full details +- `POST /models` - Create a new model +- `GET /models/{model_id}` - Get model details +- `PUT /models/{model_id}` - Update model +- `DELETE /models/{model_id}` - Delete model + +### OpenAI v1 Compatible - Implemented +- `GET /v1/models` - List models in OpenAI format +- `GET /v1/models/{model_id}` - Get specific model in OpenAI format + +### OpenAI v1 Compatible - Placeholders (TODO) +- `POST /v1/chat/completions` - Chat completions (supports streaming via `stream` parameter) +- `POST /v1/completions` - Text completions (supports streaming via `stream` parameter) +- `POST /v1/embeddings` - Generate embeddings + +### OpenAI v1 Compatible - Not Applicable +- `/v1/images/*` - Image generation (vLLM is text-only) +- `/v1/audio/*` - Audio endpoints (vLLM is text-only) +- `/v1/assistants` - Assistants API (beta feature) +- `/v1/fine_tuning/*` - Fine-tuning management +- `/v1/files` - File management +- `/v1/moderations` - Content moderation + +### Utility +- `GET /` - API info and endpoints +- `GET /health` - Health check +- `GET /docs` - Swagger UI documentation +- `GET /redoc` - ReDoc documentation + +## Project Structure + +``` +src/ +├── main.py # FastAPI application entry point +├── models/ +│ └── model.py # Model dataclass with vLLM configurations +├── services/ +│ ├── model_manager.py # Model lifecycle management +│ └── persistence.py # JSON file persistence +├── endpoints/ +│ ├── models.py # Model CRUD endpoints +│ └── v1/ # OpenAI v1 compatible endpoints +│ ├── models.py # Models listing +│ ├── chat.py # Chat completions +│ ├── completions.py # Text completions +│ ├── embeddings.py # Embeddings generation +│ └── misc.py # Other v1 endpoints +└── data/ # Persisted models (auto-created) + └── models.json +``` + +## Implementation Status + +### ✅ Completed +- [x] FastAPI application setup with CORS +- [x] Model dataclass with vLLM parameters +- [x] Model management endpoints (CRUD) +- [x] OpenAI v1 compatible `/v1/models` endpoint +- [x] Model persistence to JSON file +- [x] Port allocation for models +- [x] Environment variable configuration +- [x] All OpenAI v1 endpoint placeholders with proper request/response models +- [x] Streaming support structure (parameter-based, not separate endpoints) +- [x] Swagger/ReDoc API documentation + +### 🚧 High Priority TODO +- [ ] vLLM process spawning and management +- [ ] Implement actual chat completions logic (`/v1/chat/completions`) +- [ ] Implement actual text completions logic (`/v1/completions`) +- [ ] Server-Sent Events (SSE) streaming for both endpoints +- [ ] Request proxying to appropriate vLLM instance +- [ ] Model health monitoring and status updates +- [ ] Process cleanup on model deletion +- [ ] Automatic model loading on 
startup (spawn vLLM processes) + +### 🔄 Medium Priority TODO +- [ ] Embeddings endpoint implementation (`/v1/embeddings`) +- [ ] Load balancing for models with multiple instances +- [ ] Model configuration validation +- [ ] Error recovery and retry logic +- [ ] Graceful shutdown handling + +### 📊 Low Priority TODO +- [ ] Authentication/API keys +- [ ] Rate limiting +- [ ] Metrics and monitoring endpoints +- [ ] Content moderation endpoint +- [ ] Fine-tuning management (if applicable) + +## Model Configuration Fields + +The Model dataclass includes all vLLM parameters: +- `model`: HuggingFace model ID, local path, or URL +- `tensor_parallel_size`: GPU parallelism +- `pipeline_parallel_size`: Pipeline parallelism +- `max_model_len`: Maximum sequence length +- `dtype`: Data type (auto, float16, bfloat16, float32) +- `quantization`: Quantization method (awq, gptq, etc.) +- `trust_remote_code`: Allow remote code execution +- `gpu_memory_utilization`: GPU memory fraction (0-1) +- `max_num_seqs`: Maximum concurrent sequences + +## Important Notes + +- Models persist across server restarts in `data/models.json` +- Each model is allocated a unique port starting from 8001 +- Server runs on port 8000 by default (configurable via `APP_PORT`) +- All datetime objects are timezone-aware (UTC) +- Model status tracks lifecycle: loading, ready, error, unloading \ No newline at end of file diff --git a/README.md b/README.md index 297d22f..903f253 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,282 @@ # vLLM-Proxy +A REST API proxy that manages multiple vLLM instances, solving the limitation of vLLM only being able to load one model at a time per process. This daemon provides OpenAI-compatible endpoints while managing multiple model instances in the background. + +## Features + +- 🚀 **Multiple Model Management**: Run multiple vLLM models simultaneously +- 🔄 **OpenAI Compatible**: Drop-in replacement for OpenAI API v1 endpoints +- 💾 **Persistent Configuration**: Models persist across server restarts +- 🎯 **Automatic Routing**: Requests are automatically routed to the correct model instance +- 📊 **RESTful API**: Full CRUD operations for model management +- ⚡ **Fast & Async**: Built with FastAPI for high performance + +## Quick Start + +### Prerequisites + +- Python 3.13+ +- [uv](https://github.com/astral-sh/uv) package manager +- CUDA-capable GPU (for running vLLM models) + +### Installation + +```bash +# Clone the repository +git clone https://github.com/yourusername/vLLM-Proxy.git +cd vLLM-Proxy + +# Install dependencies +uv sync +``` + +### Running the Server + +```bash +# Start the proxy server +uv run python src/main.py + +# Or run on a different port +APP_PORT=8081 uv run python src/main.py +``` + +The server will start on `http://localhost:8000` by default. 
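+
+A quick way to confirm the proxy is up is the health endpoint:
+
+```bash
+curl http://localhost:8000/health
+# {"status": "healthy"}
+```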
+ +## API Usage + +### Model Management + +#### Create a Model + +```bash +curl -X POST http://localhost:8000/models \ + -H "Content-Type: application/json" \ + -d '{ + "name": "llama-3.2", + "model": "meta-llama/Llama-3.2-1B-Instruct", + "dtype": "float16", + "max_model_len": 4096 + }' +``` + +#### List Models + +```bash +# Full details (admin view) +curl http://localhost:8000/models + +# OpenAI compatible format +curl http://localhost:8000/v1/models +``` + +#### Update a Model + +```bash +curl -X PUT http://localhost:8000/models/{model_id} \ + -H "Content-Type: application/json" \ + -d '{ + "max_model_len": 8192, + "gpu_memory_utilization": 0.8 + }' +``` + +#### Delete a Model + +```bash +curl -X DELETE http://localhost:8000/models/{model_id} +``` + +### Chat Completions (Placeholder - TODO) + +```bash +# Non-streaming chat completion +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "llama-3.2", + "messages": [ + {"role": "user", "content": "Hello!"} + ], + "temperature": 0.7, + "max_tokens": 100 + }' + +# Streaming chat completion (when implemented) +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "llama-3.2", + "messages": [ + {"role": "user", "content": "Hello!"} + ], + "stream": true + }' +``` + +### Text Completions (Placeholder - TODO) + +```bash +# Non-streaming completion +curl -X POST http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "llama-3.2", + "prompt": "Once upon a time", + "max_tokens": 50, + "temperature": 0.7 + }' + +# Streaming completion (when implemented) +curl -X POST http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "llama-3.2", + "prompt": "Once upon a time", + "max_tokens": 50, + "stream": true + }' +``` + +### Embeddings (Placeholder - TODO) + +```bash +curl -X POST http://localhost:8000/v1/embeddings \ + -H "Content-Type: application/json" \ + -d '{ + "model": "text-embedding-model", + "input": "The food was delicious and the waiter was friendly." + }' +``` + +## Configuration + +### Environment Variables + +Create a `.env` file in the `src` directory: + +```env +# Server configuration +APP_HOST=0.0.0.0 +APP_PORT=8000 + +# Data directory for persistence +DATA_DIR=./data + +# Hugging Face token (for gated models) +HF_TOKEN=your_token_here +``` + +### Model Parameters + +When creating a model, you can configure all vLLM parameters: + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `model` | HuggingFace model ID, local path, or URL | Required | +| `tensor_parallel_size` | Number of GPUs for tensor parallelism | 1 | +| `pipeline_parallel_size` | Number of GPUs for pipeline parallelism | 1 | +| `max_model_len` | Maximum sequence length | Auto | +| `dtype` | Data type (auto, float16, bfloat16, float32) | auto | +| `quantization` | Quantization method (awq, gptq, etc.) 
| None | +| `trust_remote_code` | Allow remote code execution | false | +| `gpu_memory_utilization` | GPU memory fraction to use (0-1) | 0.9 | +| `max_num_seqs` | Maximum concurrent sequences | 256 | + +## Architecture + +``` +vLLM-Proxy + │ + ├── API Layer (FastAPI) + │ ├── /v1/* endpoints (OpenAI compatible) + │ └── /models/* endpoints (Management) + │ + ├── Model Manager + │ ├── Lifecycle management + │ ├── Port allocation + │ └── Persistence layer + │ + └── vLLM Instances (Coming Soon) + ├── Model A (port 8001) + ├── Model B (port 8002) + └── Model C (port 8003) +``` + +## API Documentation + +Once the server is running, you can access the interactive API documentation at: + +- **Swagger UI**: `http://localhost:8000/docs` +- **ReDoc**: `http://localhost:8000/redoc` + +## Development + +### Project Structure + +``` +src/ +├── main.py # FastAPI application +├── models/ # Data models +│ └── model.py # Model dataclass with vLLM configurations +├── services/ # Business logic +│ ├── model_manager.py # Model lifecycle management +│ └── persistence.py # JSON file persistence +├── endpoints/ # API endpoints +│ ├── models.py # Model CRUD operations +│ └── v1/ # OpenAI v1 compatible endpoints +│ ├── models.py # Models listing +│ ├── chat.py # Chat completions (placeholder) +│ ├── completions.py # Text completions (placeholder) +│ ├── embeddings.py # Embeddings (placeholder) +│ └── misc.py # Other v1 endpoints +└── data/ # Persistent storage + └── models.json # Saved model configurations +``` + +### Adding Dependencies + +```bash +# Add a runtime dependency +uv add package-name + +# Add a development dependency +uv add --dev package-name +``` + +## Roadmap + +### ✅ Completed +- [x] Model CRUD operations +- [x] OpenAI v1/models endpoint +- [x] Model persistence +- [x] All OpenAI v1 endpoint placeholders +- [x] Streaming support structure +- [x] Interactive API documentation + +### 🚧 High Priority +- [ ] vLLM process management +- [ ] Chat completions implementation +- [ ] Text completions implementation +- [ ] Server-Sent Events streaming +- [ ] Request proxying to vLLM instances + +### 🔄 Medium Priority +- [ ] Embeddings endpoint +- [ ] Model health monitoring +- [ ] Load balancing +- [ ] Error recovery + +### 📊 Low Priority +- [ ] Authentication/API keys +- [ ] Rate limiting +- [ ] Metrics and monitoring +- [ ] Content moderation + +## License + +MIT + +## Contributing + +Contributions are welcome! Please feel free to submit a Pull Request. 
+ diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..707d16f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,12 @@ +[project] +name = "vllm-proxy" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.13" +dependencies = [ + "fastapi>=0.116.1", + "pydantic>=2.11.7", + "python-dotenv>=1.1.1", + "uvicorn>=0.35.0", +] diff --git a/src/.env.example b/src/.env.example new file mode 100644 index 0000000..c4f7435 --- /dev/null +++ b/src/.env.example @@ -0,0 +1,6 @@ +# Application settings +APP_HOST=0.0.0.0 +APP_PORT=8000 + +# vLLM settings (will be used later) +# VLLM_BASE_PORT=8001 \ No newline at end of file diff --git a/src/data/models.json b/src/data/models.json new file mode 100644 index 0000000..d813291 --- /dev/null +++ b/src/data/models.json @@ -0,0 +1,22 @@ +[ + { + "id": "8fbd5a04-6f76-44a3-8ae8-6f620c924a97", + "name": "test-persistence", + "model": "gpt2", + "status": "loading", + "created_at": "2025-09-09T04:11:46.622217+00:00", + "updated_at": "2025-09-09T04:11:46.622218+00:00", + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "max_model_len": null, + "dtype": "float16", + "quantization": null, + "trust_remote_code": false, + "gpu_memory_utilization": 0.9, + "max_num_seqs": 256, + "port": 8001, + "process_id": null, + "config": {}, + "capabilities": [] + } +] \ No newline at end of file diff --git a/src/endpoints/__init__.py b/src/endpoints/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/endpoints/models.py b/src/endpoints/models.py new file mode 100644 index 0000000..d8b25a5 --- /dev/null +++ b/src/endpoints/models.py @@ -0,0 +1,97 @@ +from fastapi import APIRouter, HTTPException +from pydantic import BaseModel, Field +from typing import Dict, Any, Optional, List +from services.model_manager import model_manager +from models import Model + + +class CreateModelRequest(BaseModel): + name: str = Field(..., description="Model name/identifier") + model: str = Field(..., description="HuggingFace model ID, local path, or URL") + tensor_parallel_size: int = Field(default=1, ge=1) + pipeline_parallel_size: int = Field(default=1, ge=1) + max_model_len: Optional[int] = Field(default=None, ge=1) + dtype: str = Field(default="auto") + quantization: Optional[str] = Field(default=None) + trust_remote_code: bool = Field(default=False) + gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1) + max_num_seqs: int = Field(default=256, ge=1) + config: Dict[str, Any] = Field(default_factory=dict) + capabilities: List[str] = Field(default_factory=list) + + +class UpdateModelRequest(BaseModel): + name: Optional[str] = None + model: Optional[str] = None + tensor_parallel_size: Optional[int] = Field(default=None, ge=1) + pipeline_parallel_size: Optional[int] = Field(default=None, ge=1) + max_model_len: Optional[int] = Field(default=None, ge=1) + dtype: Optional[str] = None + quantization: Optional[str] = None + trust_remote_code: Optional[bool] = None + gpu_memory_utilization: Optional[float] = Field(default=None, gt=0, le=1) + max_num_seqs: Optional[int] = Field(default=None, ge=1) + config: Optional[Dict[str, Any]] = None + capabilities: Optional[List[str]] = None + + +router = APIRouter() + + +@router.get("/models") +async def list_models() -> List[Dict[str, Any]]: + """List all models with full details""" + models = model_manager.list_models() + return [model.to_admin_format() for model in models] + + +@router.get("/models/{model_id}") +async def get_model(model_id: str) -> 
Dict[str, Any]: + """Get full details of a specific model""" + model = model_manager.get_model(model_id) + if not model: + raise HTTPException(status_code=404, detail=f"Model {model_id} not found") + return model.to_admin_format() + + +@router.post("/models") +async def create_model(request: CreateModelRequest) -> Dict[str, Any]: + """Create a new model""" + model = Model( + name=request.name, + model=request.model, + tensor_parallel_size=request.tensor_parallel_size, + pipeline_parallel_size=request.pipeline_parallel_size, + max_model_len=request.max_model_len, + dtype=request.dtype, + quantization=request.quantization, + trust_remote_code=request.trust_remote_code, + gpu_memory_utilization=request.gpu_memory_utilization, + max_num_seqs=request.max_num_seqs, + config=request.config, + capabilities=request.capabilities, + ) + + created_model = model_manager.create_model(model) + return created_model.to_admin_format() + + +@router.put("/models/{model_id}") +async def update_model(model_id: str, request: UpdateModelRequest) -> Dict[str, Any]: + """Update an existing model""" + updates = request.model_dump(exclude_unset=True) + updated_model = model_manager.update_model(model_id, updates) + + if not updated_model: + raise HTTPException(status_code=404, detail=f"Model {model_id} not found") + + return updated_model.to_admin_format() + + +@router.delete("/models/{model_id}") +async def delete_model(model_id: str) -> Dict[str, str]: + """Delete a model""" + if model_manager.delete_model(model_id): + return {"message": f"Model {model_id} deleted successfully"} + else: + raise HTTPException(status_code=404, detail=f"Model {model_id} not found") diff --git a/src/endpoints/v1/__init__.py b/src/endpoints/v1/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/endpoints/v1/chat.py b/src/endpoints/v1/chat.py new file mode 100644 index 0000000..f2d019d --- /dev/null +++ b/src/endpoints/v1/chat.py @@ -0,0 +1,72 @@ +from fastapi import APIRouter, HTTPException +from pydantic import BaseModel, Field +from typing import List, Optional, Dict, Any, Union, Literal +from datetime import datetime + +router = APIRouter(prefix="/v1") + + +class ChatMessage(BaseModel): + role: Literal["system", "user", "assistant", "function"] + content: str + name: Optional[str] = None + function_call: Optional[Dict[str, Any]] = None + + +class ChatCompletionRequest(BaseModel): + model: str + messages: List[ChatMessage] + temperature: Optional[float] = Field(default=1.0, ge=0, le=2) + top_p: Optional[float] = Field(default=1.0, ge=0, le=1) + n: Optional[int] = Field(default=1, ge=1) + stream: Optional[bool] = False + stop: Optional[Union[str, List[str]]] = None + max_tokens: Optional[int] = None + presence_penalty: Optional[float] = Field(default=0, ge=-2, le=2) + frequency_penalty: Optional[float] = Field(default=0, ge=-2, le=2) + logit_bias: Optional[Dict[str, float]] = None + user: Optional[str] = None + seed: Optional[int] = None + tools: Optional[List[Dict[str, Any]]] = None + tool_choice: Optional[Union[str, Dict[str, Any]]] = None + response_format: Optional[Dict[str, Any]] = None + + +@router.post("/chat/completions") +async def create_chat_completion(request: ChatCompletionRequest): + """ + Create a chat completion - OpenAI compatible endpoint + Handles both streaming and non-streaming responses based on request.stream + TODO: Implement actual vLLM chat completion logic + """ + + if request.stream: + # TODO: Implement Server-Sent Events (SSE) streaming + # Should return StreamingResponse with 
media_type="text/event-stream" + raise HTTPException( + status_code=501, + detail="Streaming chat completions not yet implemented" + ) + + # Non-streaming response + return { + "id": "chatcmpl-placeholder", + "object": "chat.completion", + "created": int(datetime.now().timestamp()), + "model": request.model, + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "This is a placeholder response. vLLM integration pending." + }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 0, + "completion_tokens": 0, + "total_tokens": 0 + } + } \ No newline at end of file diff --git a/src/endpoints/v1/completions.py b/src/endpoints/v1/completions.py new file mode 100644 index 0000000..f27b812 --- /dev/null +++ b/src/endpoints/v1/completions.py @@ -0,0 +1,64 @@ +from fastapi import APIRouter, HTTPException +from pydantic import BaseModel, Field +from typing import List, Optional, Dict, Any, Union +from datetime import datetime + +router = APIRouter(prefix="/v1") + + +class CompletionRequest(BaseModel): + model: str + prompt: Union[str, List[str], List[int], List[List[int]]] + suffix: Optional[str] = None + max_tokens: Optional[int] = 16 + temperature: Optional[float] = Field(default=1.0, ge=0, le=2) + top_p: Optional[float] = Field(default=1.0, ge=0, le=1) + n: Optional[int] = Field(default=1, ge=1) + stream: Optional[bool] = False + logprobs: Optional[int] = None + echo: Optional[bool] = False + stop: Optional[Union[str, List[str]]] = None + presence_penalty: Optional[float] = Field(default=0, ge=-2, le=2) + frequency_penalty: Optional[float] = Field(default=0, ge=-2, le=2) + best_of: Optional[int] = Field(default=1, ge=1) + logit_bias: Optional[Dict[str, float]] = None + user: Optional[str] = None + seed: Optional[int] = None + + +@router.post("/completions") +async def create_completion(request: CompletionRequest): + """ + Create a text completion - OpenAI compatible endpoint + Handles both streaming and non-streaming responses based on request.stream + TODO: Implement actual vLLM completion logic + """ + + if request.stream: + # TODO: Implement Server-Sent Events (SSE) streaming + # Should return StreamingResponse with media_type="text/event-stream" + raise HTTPException( + status_code=501, + detail="Streaming completions not yet implemented" + ) + + # Non-streaming response + return { + "id": "cmpl-placeholder", + "object": "text_completion", + "created": int(datetime.now().timestamp()), + "model": request.model, + "choices": [ + { + "text": "This is a placeholder completion response.", + "index": 0, + "logprobs": None, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 0, + "completion_tokens": 0, + "total_tokens": 0 + } + } \ No newline at end of file diff --git a/src/endpoints/v1/embeddings.py b/src/endpoints/v1/embeddings.py new file mode 100644 index 0000000..55cf3d6 --- /dev/null +++ b/src/endpoints/v1/embeddings.py @@ -0,0 +1,45 @@ +from fastapi import APIRouter, HTTPException +from pydantic import BaseModel, Field +from typing import List, Optional, Union +from datetime import datetime + +router = APIRouter(prefix="/v1") + + +class EmbeddingRequest(BaseModel): + input: Union[str, List[str], List[int], List[List[int]]] + model: str + encoding_format: Optional[str] = Field(default="float", pattern="^(float|base64)$") + user: Optional[str] = None + + +@router.post("/embeddings") +async def create_embeddings(request: EmbeddingRequest): + """ + Create embeddings - OpenAI compatible endpoint + TODO: Implement actual embedding generation 
with vLLM or sentence-transformers + """ + # Check if model supports embeddings + # Note: vLLM primarily focuses on text generation, may need separate embedding models + + # Placeholder response + fake_embedding = [0.0] * 768 # Common embedding dimension + + inputs = request.input if isinstance(request.input, list) else [request.input] + + return { + "object": "list", + "data": [ + { + "object": "embedding", + "embedding": fake_embedding, + "index": i + } + for i in range(len(inputs)) + ], + "model": request.model, + "usage": { + "prompt_tokens": 0, + "total_tokens": 0 + } + } \ No newline at end of file diff --git a/src/endpoints/v1/misc.py b/src/endpoints/v1/misc.py new file mode 100644 index 0000000..786b512 --- /dev/null +++ b/src/endpoints/v1/misc.py @@ -0,0 +1,97 @@ +from fastapi import APIRouter, HTTPException +from pydantic import BaseModel +from typing import List, Optional, Dict, Any +from datetime import datetime + +router = APIRouter(prefix="/v1") + + +# Files endpoint (for fine-tuning, not critical for vLLM proxy) +@router.get("/files") +async def list_files(): + """ + List files - OpenAI compatible endpoint + TODO: Decide if file management is needed for vLLM proxy + """ + return { + "object": "list", + "data": [] + } + + +# Fine-tuning endpoints (might not be applicable for vLLM proxy) +@router.get("/fine_tuning/jobs") +async def list_fine_tuning_jobs(): + """ + List fine-tuning jobs + TODO: Decide if fine-tuning management is needed + """ + return { + "object": "list", + "data": [], + "has_more": False + } + + +# Assistants API (beta, probably not needed for vLLM proxy) +@router.get("/assistants") +async def list_assistants(): + """ + List assistants - OpenAI compatible endpoint + Note: This is a beta feature in OpenAI, likely not needed for vLLM proxy + """ + raise HTTPException( + status_code=501, + detail="Assistants API not supported in vLLM proxy" + ) + + +# Images endpoint (not applicable for vLLM) +@router.post("/images/generations") +async def create_image(): + """ + Generate images - OpenAI compatible endpoint + Note: vLLM is for text generation, not image generation + """ + raise HTTPException( + status_code=501, + detail="Image generation not supported - vLLM is for text models only" + ) + + +# Audio endpoints (not applicable for vLLM) +@router.post("/audio/transcriptions") +async def create_transcription(): + """ + Transcribe audio - OpenAI compatible endpoint + Note: vLLM is for text generation, not audio processing + """ + raise HTTPException( + status_code=501, + detail="Audio transcription not supported - vLLM is for text models only" + ) + + +@router.post("/audio/speech") +async def create_speech(): + """ + Generate speech - OpenAI compatible endpoint + Note: vLLM is for text generation, not audio generation + """ + raise HTTPException( + status_code=501, + detail="Speech generation not supported - vLLM is for text models only" + ) + + +# Moderation endpoint +@router.post("/moderations") +async def create_moderation(): + """ + Check content moderation - OpenAI compatible endpoint + TODO: Could integrate a separate moderation model if needed + """ + raise HTTPException( + status_code=501, + detail="Content moderation not yet implemented" + ) \ No newline at end of file diff --git a/src/endpoints/v1/models.py b/src/endpoints/v1/models.py new file mode 100644 index 0000000..7bf43b5 --- /dev/null +++ b/src/endpoints/v1/models.py @@ -0,0 +1,24 @@ +from fastapi import APIRouter +from typing import Dict, Any +from services.model_manager import model_manager + 
+router = APIRouter(prefix="/v1") + + +@router.get("/models") +async def list_models() -> Dict[str, Any]: + """OpenAI-compatible models endpoint""" + models = model_manager.list_models() + return { + "object": "list", + "data": [model.to_openai_format() for model in models] + } + + +@router.get("/models/{model_id}") +async def get_model(model_id: str) -> Dict[str, Any]: + """Get a specific model in OpenAI format""" + model = model_manager.get_model(model_id) + if not model: + return {"error": {"message": f"Model {model_id} not found", "type": "invalid_request_error"}} + return model.to_openai_format() diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..7bea78b --- /dev/null +++ b/src/main.py @@ -0,0 +1,77 @@ +from endpoints.v1 import models as v1_models +from endpoints.v1 import chat as v1_chat +from endpoints.v1 import completions as v1_completions +from endpoints.v1 import embeddings as v1_embeddings +from endpoints.v1 import misc as v1_misc +from endpoints import models + +import uvicorn +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +from dotenv import load_dotenv +import os + +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent)) + + +# Load environment variables +load_dotenv() + +# Create FastAPI app +app = FastAPI( + title="vLLM Proxy", + description="A proxy API for managing multiple vLLM instances", + version="0.1.0", +) + +# Add CORS middleware +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Include routers +app.include_router(v1_models.router, tags=["OpenAI v1 - Models"]) +app.include_router(v1_chat.router, tags=["OpenAI v1 - Chat"]) +app.include_router(v1_completions.router, tags=["OpenAI v1 - Completions"]) +app.include_router(v1_embeddings.router, tags=["OpenAI v1 - Embeddings"]) +app.include_router(v1_misc.router, tags=["OpenAI v1 - Misc"]) +app.include_router(models.router, tags=["Model Management"]) + + +@app.get("/") +async def root(): + return { + "name": "vLLM Proxy", + "version": "0.1.0", + "endpoints": { + "v1": "/v1/models - OpenAI compatible models endpoint", + "models": "/models - Model management endpoints (CRUD)" + } + } + + +@app.get("/health") +async def health(): + return {"status": "healthy"} + + +def main(): + port = int(os.getenv("APP_PORT", "8000")) + host = os.getenv("APP_HOST", "0.0.0.0") + + uvicorn.run( + app, # Pass the app directly instead of string import + host=host, + port=port, + reload=False # Disable reload to avoid import issues + ) + + +if __name__ == "__main__": + main() diff --git a/src/models/__init__.py b/src/models/__init__.py new file mode 100644 index 0000000..df0c860 --- /dev/null +++ b/src/models/__init__.py @@ -0,0 +1,3 @@ +from .model import Model, ModelStatus + +__all__ = ["Model", "ModelStatus"] \ No newline at end of file diff --git a/src/models/model.py b/src/models/model.py new file mode 100644 index 0000000..93fd0b8 --- /dev/null +++ b/src/models/model.py @@ -0,0 +1,75 @@ +from dataclasses import dataclass, field +from datetime import datetime, timezone +from enum import Enum +from typing import Optional, Dict, Any, List +from uuid import uuid4 + + +class ModelStatus(Enum): + LOADING = "loading" + READY = "ready" + ERROR = "error" + UNLOADING = "unloading" + + +@dataclass +class Model: + id: str = field(default_factory=lambda: str(uuid4())) + name: str = "" + model: str = "" # HuggingFace ID, local path, or URL + status: ModelStatus = 
ModelStatus.LOADING + created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc)) + updated_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc)) + + # vLLM specific configurations + tensor_parallel_size: int = 1 + pipeline_parallel_size: int = 1 + max_model_len: Optional[int] = None + dtype: str = "auto" + quantization: Optional[str] = None + trust_remote_code: bool = False + gpu_memory_utilization: float = 0.9 + max_num_seqs: int = 256 + + # Process management + port: Optional[int] = None + process_id: Optional[int] = None + + # Metadata + config: Dict[str, Any] = field(default_factory=dict) + capabilities: List[str] = field(default_factory=list) + + def to_openai_format(self) -> Dict[str, Any]: + """Convert to OpenAI API compatible format for /v1/models endpoint""" + return { + "id": self.id, + "object": "model", + "created": int(self.created_at.timestamp()), + "owned_by": "vllm-proxy", + "permission": [], + "root": self.name, + "parent": None, + } + + def to_admin_format(self) -> Dict[str, Any]: + """Full model details for admin endpoints""" + return { + "id": self.id, + "name": self.name, + "model": self.model, + "status": self.status.value, + "created_at": self.created_at.isoformat(), + "updated_at": self.updated_at.isoformat(), + "tensor_parallel_size": self.tensor_parallel_size, + "pipeline_parallel_size": self.pipeline_parallel_size, + "max_model_len": self.max_model_len, + "dtype": self.dtype, + "quantization": self.quantization, + "trust_remote_code": self.trust_remote_code, + "gpu_memory_utilization": self.gpu_memory_utilization, + "max_num_seqs": self.max_num_seqs, + "port": self.port, + "process_id": self.process_id, + "config": self.config, + "capabilities": self.capabilities, + } diff --git a/src/services/__init__.py b/src/services/__init__.py new file mode 100644 index 0000000..591317a --- /dev/null +++ b/src/services/__init__.py @@ -0,0 +1,3 @@ +from .model_manager import ModelManager + +__all__ = ["ModelManager"] \ No newline at end of file diff --git a/src/services/model_manager.py b/src/services/model_manager.py new file mode 100644 index 0000000..f9ee5c1 --- /dev/null +++ b/src/services/model_manager.py @@ -0,0 +1,113 @@ +from typing import Dict, List, Optional +from datetime import datetime, timezone +from models import Model, ModelStatus +from services.persistence import persistence_manager + + +class ModelManager: + def __init__(self): + self.models: Dict[str, Model] = {} + self.next_port = 8001 + self._load_models() + + def list_models(self) -> List[Model]: + """List all registered models""" + return list(self.models.values()) + + def get_model(self, model_id: str) -> Optional[Model]: + """Get a specific model by ID""" + return self.models.get(model_id) + + def create_model(self, model: Model) -> Model: + """Create a new model""" + model.port = self._allocate_port() + model.status = ModelStatus.LOADING + model.created_at = datetime.now(timezone.utc) + model.updated_at = datetime.now(timezone.utc) + self.models[model.id] = model + self._save_models() + return model + + def update_model(self, model_id: str, updates: Dict) -> Optional[Model]: + """Update an existing model""" + if model_id not in self.models: + return None + + model = self.models[model_id] + + # Update allowed fields + allowed_fields = { + "name", "model", "tensor_parallel_size", "pipeline_parallel_size", + "max_model_len", "dtype", "quantization", "trust_remote_code", + "gpu_memory_utilization", "max_num_seqs", "config", "capabilities" + } + + for field, value 
in updates.items(): + if field in allowed_fields and value is not None: + setattr(model, field, value) + + model.updated_at = datetime.now(timezone.utc) + self._save_models() + return model + + def delete_model(self, model_id: str) -> bool: + """Delete a model""" + if model_id in self.models: + model = self.models[model_id] + if model.port: + self._release_port(model.port) + del self.models[model_id] + self._save_models() + return True + return False + + def _allocate_port(self) -> int: + """Allocate a port for a new model""" + port = self.next_port + self.next_port += 1 + return port + + def _release_port(self, port: int) -> None: + """Release a port when a model is deleted""" + # In a real implementation, we might want to track and reuse ports + pass + + def _save_models(self) -> None: + """Save all models to disk""" + models_data = [model.to_admin_format() for model in self.models.values()] + persistence_manager.save_models(models_data) + + def _load_models(self) -> None: + """Load models from disk on startup""" + models_data = persistence_manager.load_models() + for model_data in models_data: + # Reconstruct Model object from saved data + model = Model( + id=model_data.get("id"), + name=model_data.get("name", ""), + model=model_data.get("model", ""), + status=ModelStatus(model_data.get("status", "loading")), + created_at=model_data.get("created_at"), + updated_at=model_data.get("updated_at"), + tensor_parallel_size=model_data.get("tensor_parallel_size", 1), + pipeline_parallel_size=model_data.get("pipeline_parallel_size", 1), + max_model_len=model_data.get("max_model_len"), + dtype=model_data.get("dtype", "auto"), + quantization=model_data.get("quantization"), + trust_remote_code=model_data.get("trust_remote_code", False), + gpu_memory_utilization=model_data.get("gpu_memory_utilization", 0.9), + max_num_seqs=model_data.get("max_num_seqs", 256), + port=model_data.get("port"), + process_id=model_data.get("process_id"), + config=model_data.get("config", {}), + capabilities=model_data.get("capabilities", []), + ) + self.models[model.id] = model + + # Update next_port to avoid conflicts + if model.port and model.port >= self.next_port: + self.next_port = model.port + 1 + + +# Global instance +model_manager = ModelManager() diff --git a/src/services/persistence.py b/src/services/persistence.py new file mode 100644 index 0000000..e879d8f --- /dev/null +++ b/src/services/persistence.py @@ -0,0 +1,67 @@ +import json +import os +from pathlib import Path +from typing import Dict, Any, List +from datetime import datetime +import logging + +logger = logging.getLogger(__name__) + + +class PersistenceManager: + def __init__(self, data_dir: str = None): + if data_dir is None: + # Use absolute path to ensure consistency regardless of where script is run from + default_data = Path(__file__).parent.parent / "data" + data_dir = os.getenv("DATA_DIR", str(default_data)) + + self.data_dir = Path(data_dir) + self.data_dir.mkdir(parents=True, exist_ok=True) + self.models_file = self.data_dir / "models.json" + + def save_models(self, models: List[Dict[str, Any]]) -> None: + """Save models to JSON file""" + try: + # Convert datetime objects to ISO format strings + serializable_models = [] + for model in models: + model_copy = model.copy() + if "created_at" in model_copy and isinstance(model_copy["created_at"], datetime): + model_copy["created_at"] = model_copy["created_at"].isoformat() + if "updated_at" in model_copy and isinstance(model_copy["updated_at"], datetime): + model_copy["updated_at"] = 
model_copy["updated_at"].isoformat() + serializable_models.append(model_copy) + + with open(self.models_file, 'w') as f: + json.dump(serializable_models, f, indent=2) + + logger.info(f"Saved {len(models)} models to {self.models_file}") + except Exception as e: + logger.error(f"Failed to save models: {e}") + + def load_models(self) -> List[Dict[str, Any]]: + """Load models from JSON file""" + if not self.models_file.exists(): + logger.info("No existing models file found") + return [] + + try: + with open(self.models_file, 'r') as f: + models = json.load(f) + + # Convert ISO format strings back to datetime objects + for model in models: + if "created_at" in model and isinstance(model["created_at"], str): + model["created_at"] = datetime.fromisoformat(model["created_at"]) + if "updated_at" in model and isinstance(model["updated_at"], str): + model["updated_at"] = datetime.fromisoformat(model["updated_at"]) + + logger.info(f"Loaded {len(models)} models from {self.models_file}") + return models + except Exception as e: + logger.error(f"Failed to load models: {e}") + return [] + + +# Global instance +persistence_manager = PersistenceManager() diff --git a/uv.lock b/uv.lock new file mode 100644 index 0000000..0de70b1 --- /dev/null +++ b/uv.lock @@ -0,0 +1,204 @@ +version = 1 +revision = 3 +requires-python = ">=3.13" + +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, +] + +[[package]] +name = "anyio" +version = "4.10.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "sniffio" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f1/b4/636b3b65173d3ce9a38ef5f0522789614e590dab6a8d505340a4efe4c567/anyio-4.10.0.tar.gz", hash = "sha256:3f3fae35c96039744587aa5b8371e7e8e603c0702999535961dd336026973ba6", size = 213252, upload-time = "2025-08-04T08:54:26.451Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6f/12/e5e0282d673bb9746bacfb6e2dba8719989d3660cdb2ea79aee9a9651afb/anyio-4.10.0-py3-none-any.whl", hash = "sha256:60e474ac86736bbfd6f210f7a61218939c318f43f9972497381f1c5e930ed3d1", size = 107213, upload-time = "2025-08-04T08:54:24.882Z" }, +] + +[[package]] +name = "click" +version = "8.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/60/6c/8ca2efa64cf75a977a0d7fac081354553ebe483345c734fb6b6515d96bbc/click-8.2.1.tar.gz", hash = "sha256:27c491cc05d968d271d5a1db13e3b5a184636d9d930f148c50b038f0d0646202", size = 286342, upload-time = "2025-05-20T23:19:49.832Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl", hash = "sha256:61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b", size = 102215, upload-time = 
"2025-05-20T23:19:47.796Z" }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, +] + +[[package]] +name = "fastapi" +version = "0.116.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "starlette" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/78/d7/6c8b3bfe33eeffa208183ec037fee0cce9f7f024089ab1c5d12ef04bd27c/fastapi-0.116.1.tar.gz", hash = "sha256:ed52cbf946abfd70c5a0dccb24673f0670deeb517a88b3544d03c2a6bf283143", size = 296485, upload-time = "2025-07-11T16:22:32.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/47/d63c60f59a59467fda0f93f46335c9d18526d7071f025cb5b89d5353ea42/fastapi-0.116.1-py3-none-any.whl", hash = "sha256:c46ac7c312df840f0c9e220f7964bada936781bc4e2e6eb71f1c4d7553786565", size = 95631, upload-time = "2025-07-11T16:22:30.485Z" }, +] + +[[package]] +name = "h11" +version = "0.16.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, +] + +[[package]] +name = "idna" +version = "3.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload-time = "2024-09-15T18:07:39.745Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" }, +] + +[[package]] +name = "pydantic" +version = "2.11.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-types" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/00/dd/4325abf92c39ba8623b5af936ddb36ffcfe0beae70405d456ab1fb2f5b8c/pydantic-2.11.7.tar.gz", hash = "sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db", size = 788350, upload-time = "2025-06-14T08:33:17.137Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl", hash = "sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b", size = 444782, upload-time = "2025-06-14T08:33:14.905Z" }, +] + +[[package]] +name = "pydantic-core" +version = "2.33.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ad/88/5f2260bdfae97aabf98f1778d43f69574390ad787afb646292a638c923d4/pydantic_core-2.33.2.tar.gz", hash = "sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc", size = 435195, upload-time = "2025-04-23T18:33:52.104Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/46/8c/99040727b41f56616573a28771b1bfa08a3d3fe74d3d513f01251f79f172/pydantic_core-2.33.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f", size = 2015688, upload-time = "2025-04-23T18:31:53.175Z" }, + { url = "https://files.pythonhosted.org/packages/3a/cc/5999d1eb705a6cefc31f0b4a90e9f7fc400539b1a1030529700cc1b51838/pydantic_core-2.33.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6", size = 1844808, upload-time = "2025-04-23T18:31:54.79Z" }, + { url = "https://files.pythonhosted.org/packages/6f/5e/a0a7b8885c98889a18b6e376f344da1ef323d270b44edf8174d6bce4d622/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef", size = 1885580, upload-time = "2025-04-23T18:31:57.393Z" }, + { url = "https://files.pythonhosted.org/packages/3b/2a/953581f343c7d11a304581156618c3f592435523dd9d79865903272c256a/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a", size = 1973859, upload-time = "2025-04-23T18:31:59.065Z" }, + { url = "https://files.pythonhosted.org/packages/e6/55/f1a813904771c03a3f97f676c62cca0c0a4138654107c1b61f19c644868b/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916", size = 2120810, upload-time = "2025-04-23T18:32:00.78Z" }, + { url = "https://files.pythonhosted.org/packages/aa/c3/053389835a996e18853ba107a63caae0b9deb4a276c6b472931ea9ae6e48/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a", size = 2676498, upload-time = "2025-04-23T18:32:02.418Z" }, + { url = "https://files.pythonhosted.org/packages/eb/3c/f4abd740877a35abade05e437245b192f9d0ffb48bbbbd708df33d3cda37/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d", size = 2000611, upload-time = "2025-04-23T18:32:04.152Z" }, + { url = "https://files.pythonhosted.org/packages/59/a7/63ef2fed1837d1121a894d0ce88439fe3e3b3e48c7543b2a4479eb99c2bd/pydantic_core-2.33.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56", size = 2107924, upload-time = "2025-04-23T18:32:06.129Z" }, + { url = 
"https://files.pythonhosted.org/packages/04/8f/2551964ef045669801675f1cfc3b0d74147f4901c3ffa42be2ddb1f0efc4/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5", size = 2063196, upload-time = "2025-04-23T18:32:08.178Z" }, + { url = "https://files.pythonhosted.org/packages/26/bd/d9602777e77fc6dbb0c7db9ad356e9a985825547dce5ad1d30ee04903918/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e", size = 2236389, upload-time = "2025-04-23T18:32:10.242Z" }, + { url = "https://files.pythonhosted.org/packages/42/db/0e950daa7e2230423ab342ae918a794964b053bec24ba8af013fc7c94846/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162", size = 2239223, upload-time = "2025-04-23T18:32:12.382Z" }, + { url = "https://files.pythonhosted.org/packages/58/4d/4f937099c545a8a17eb52cb67fe0447fd9a373b348ccfa9a87f141eeb00f/pydantic_core-2.33.2-cp313-cp313-win32.whl", hash = "sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849", size = 1900473, upload-time = "2025-04-23T18:32:14.034Z" }, + { url = "https://files.pythonhosted.org/packages/a0/75/4a0a9bac998d78d889def5e4ef2b065acba8cae8c93696906c3a91f310ca/pydantic_core-2.33.2-cp313-cp313-win_amd64.whl", hash = "sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9", size = 1955269, upload-time = "2025-04-23T18:32:15.783Z" }, + { url = "https://files.pythonhosted.org/packages/f9/86/1beda0576969592f1497b4ce8e7bc8cbdf614c352426271b1b10d5f0aa64/pydantic_core-2.33.2-cp313-cp313-win_arm64.whl", hash = "sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9", size = 1893921, upload-time = "2025-04-23T18:32:18.473Z" }, + { url = "https://files.pythonhosted.org/packages/a4/7d/e09391c2eebeab681df2b74bfe6c43422fffede8dc74187b2b0bf6fd7571/pydantic_core-2.33.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac", size = 1806162, upload-time = "2025-04-23T18:32:20.188Z" }, + { url = "https://files.pythonhosted.org/packages/f1/3d/847b6b1fed9f8ed3bb95a9ad04fbd0b212e832d4f0f50ff4d9ee5a9f15cf/pydantic_core-2.33.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5", size = 1981560, upload-time = "2025-04-23T18:32:22.354Z" }, + { url = "https://files.pythonhosted.org/packages/6f/9a/e73262f6c6656262b5fdd723ad90f518f579b7bc8622e43a942eec53c938/pydantic_core-2.33.2-cp313-cp313t-win_amd64.whl", hash = "sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9", size = 1935777, upload-time = "2025-04-23T18:32:25.088Z" }, +] + +[[package]] +name = "python-dotenv" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f6/b0/4bc07ccd3572a2f9df7e6782f52b0c6c90dcbb803ac4a167702d7d0dfe1e/python_dotenv-1.1.1.tar.gz", hash = "sha256:a8a6399716257f45be6a007360200409fce5cda2661e3dec71d23dc15f6189ab", size = 41978, upload-time = "2025-06-24T04:21:07.341Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5f/ed/539768cf28c661b5b068d66d96a2f155c4971a5d55684a514c1a0e0dec2f/python_dotenv-1.1.1-py3-none-any.whl", hash = "sha256:31f23644fe2602f88ff55e1f5c79ba497e01224ee7737937930c448e4d0e24dc", size = 20556, upload-time = 
"2025-06-24T04:21:06.073Z" }, +] + +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, +] + +[[package]] +name = "starlette" +version = "0.47.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/15/b9/cc3017f9a9c9b6e27c5106cc10cc7904653c3eec0729793aec10479dd669/starlette-0.47.3.tar.gz", hash = "sha256:6bc94f839cc176c4858894f1f8908f0ab79dfec1a6b8402f6da9be26ebea52e9", size = 2584144, upload-time = "2025-08-24T13:36:42.122Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ce/fd/901cfa59aaa5b30a99e16876f11abe38b59a1a2c51ffb3d7142bb6089069/starlette-0.47.3-py3-none-any.whl", hash = "sha256:89c0778ca62a76b826101e7c709e70680a1699ca7da6b44d38eb0a7e61fe4b51", size = 72991, upload-time = "2025-08-24T13:36:40.887Z" }, +] + +[[package]] +name = "typing-extensions" +version = "4.15.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, +] + +[[package]] +name = "typing-inspection" +version = "0.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f8/b1/0c11f5058406b3af7609f121aaa6b609744687f1d158b3c3a5bf4cc94238/typing_inspection-0.4.1.tar.gz", hash = "sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28", size = 75726, upload-time = "2025-05-21T18:55:23.885Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl", hash = "sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51", size = 14552, upload-time = "2025-05-21T18:55:22.152Z" }, +] + +[[package]] +name = "uvicorn" +version = "0.35.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5e/42/e0e305207bb88c6b8d3061399c6a961ffe5fbb7e2aa63c9234df7259e9cd/uvicorn-0.35.0.tar.gz", hash = "sha256:bc662f087f7cf2ce11a1d7fd70b90c9f98ef2e2831556dd078d131b96cc94a01", size = 78473, upload-time = "2025-06-28T16:15:46.058Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/d2/e2/dc81b1bd1dcfe91735810265e9d26bc8ec5da45b4c0f6237e286819194c3/uvicorn-0.35.0-py3-none-any.whl", hash = "sha256:197535216b25ff9b785e29a0b79199f55222193d47f820816e7da751e9bc8d4a", size = 66406, upload-time = "2025-06-28T16:15:44.816Z" }, +] + +[[package]] +name = "vllm-proxy" +version = "0.1.0" +source = { virtual = "." } +dependencies = [ + { name = "fastapi" }, + { name = "pydantic" }, + { name = "python-dotenv" }, + { name = "uvicorn" }, +] + +[package.metadata] +requires-dist = [ + { name = "fastapi", specifier = ">=0.116.1" }, + { name = "pydantic", specifier = ">=2.11.7" }, + { name = "python-dotenv", specifier = ">=1.1.1" }, + { name = "uvicorn", specifier = ">=0.35.0" }, +]