# source: https://towardsdatascience.com
import asyncio
import json
import time
from typing import List, Optional

from fastapi import FastAPI
from pydantic import BaseModel
from starlette.responses import StreamingResponse

app = FastAPI(title="OpenAI-compatible API")

# data models mirroring the OpenAI chat completion request schema
class Message(BaseModel):
    role: str
    content: str


class ChatCompletionRequest(BaseModel):
    model: Optional[str] = "mock-gpt-model"
    messages: List[Message]
    max_tokens: Optional[int] = 512
    temperature: Optional[float] = 0.1
    stream: Optional[bool] = False
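
# An example body that validates against ChatCompletionRequest (illustrative;
# omitted fields fall back to the defaults above):
# {"messages": [{"role": "user", "content": "Say this is a test"}], "stream": true}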

async def _resp_async_generator(text_resp: str, model: str):
    # let's pretend every word is a token and stream it back over time
    tokens = text_resp.split(" ")
    for i, token in enumerate(tokens):
        chunk = {
            "id": i,
            "object": "chat.completion.chunk",
            "created": int(time.time()),  # OpenAI timestamps are Unix epoch ints
            "model": model,
            "choices": [{"delta": {"content": token + " "}}],
        }
        # server-sent events: each frame is "data: <json>" followed by a blank line
        yield f"data: {json.dumps(chunk)}\n\n"
        await asyncio.sleep(1)
    yield "data: [DONE]\n\n"
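
# An illustrative frame as it appears on the wire (timestamp made up):
# data: {"id": 0, "object": "chat.completion.chunk", "created": 1700000000,
#        "model": "mock-gpt-model", "choices": [{"delta": {"content": "As "}}]}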

@app.post("/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    if request.messages:
        resp_content = (
            "As a mock AI Assistant, I can only echo your last message: "
            + request.messages[-1].content
        )
    else:
        resp_content = (
            "As a mock AI Assistant, I can only echo your last message, "
            "but there wasn't one!"
        )
    if request.stream:
        return StreamingResponse(
            _resp_async_generator(resp_content, request.model),
            media_type="text/event-stream",  # the generator emits SSE frames
        )
    return {
        "id": "1337",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": request.model,
        "choices": [{"message": Message(role="assistant", content=resp_content)}],
    }

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
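
# Quick smoke test: before wiring up the OpenAI client, the endpoint can be hit
# with any HTTP library. A minimal sketch using requests (an assumption, not part
# of the original article) against the server started above:
import requests

resp = requests.post(
    "http://localhost:8000/chat/completions",
    json={"messages": [{"role": "user", "content": "hello"}]},
)
print(resp.json()["choices"][0]["message"]["content"])
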
# Direct generation
from openai import OpenAI
# init client and connect to localhost server
client = OpenAI(
    api_key="fake-api-key",
    base_url="http://localhost:8000",  # change the default port if needed
)
# call API
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Say this is a test",
        }
    ],
    model="gpt-1337-turbo-pro-max",
)
# print the top "choice"
print(chat_completion.choices[0].message.content)
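
# The mock response parses into the client's regular ChatCompletion object, so
# the other fields the server set are readable too, e.g.:
print(chat_completion.id, chat_completion.model)  # 1337 gpt-1337-turbo-pro-max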
# Streaming chunks
from openai import OpenAI
# init client and connect to localhost server
client = OpenAI(
    api_key="fake-api-key",
    base_url="http://localhost:8000",  # change the default port if needed
)
stream = client.chat.completions.create(
    model="mock-gpt-model",
    messages=[{"role": "user", "content": "Say this is a test"}],
    stream=True,
)

for chunk in stream:
    # each chunk carries one token in its delta; end="" keeps them on one line
    print(chunk.choices[0].delta.content or "", end="", flush=True)
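
# To inspect the raw server-sent events the generator emits (the "data: ..."
# frames), the stream can also be read without the OpenAI client. A minimal
# sketch with requests (an assumption; any streaming HTTP client works):
import requests

with requests.post(
    "http://localhost:8000/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Say this is a test"}],
        "stream": True,
    },
    stream=True,
) as resp:
    for line in resp.iter_lines():
        if line:  # skip the blank separator lines between SSE frames
            print(line.decode())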