Convert Text to Speech Programmatically: Complete Developer Guide
Converting text to speech programmatically is an HTTP call. You POST text to an API endpoint, and get back an audio file. The details — chunking long content, handling formats, caching, streaming — are what this guide covers.
The Basic Pattern
Every TTS API follows the same shape:
POST /v1/tts
X-API-Key: <key>
Content-Type: application/json
{"text": "...", "voice": "...", "format": "mp3"}
→ binary audio data
Here's that pattern in three languages.
Python
Using the requests library:
import requests
import os
def text_to_speech(
    text: str,
    output_file: str,
    voice: str = "en-US-1",
    timeout: float = 30.0,
) -> None:
    """Synthesize ``text`` as MP3 and write the audio to ``output_file``.

    Args:
        text: Text sent to the TTS endpoint.
        output_file: Path the returned audio bytes are written to.
        voice: Voice identifier sent in the request body.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so a stalled connection would otherwise
            block forever.

    Raises:
        KeyError: If ``SPEEKO_API_KEY`` is not set in the environment.
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.post(
        "https://api.speekoapp.com/v1/tts",
        headers={
            "X-API-Key": os.environ["SPEEKO_API_KEY"],
            "Content-Type": "application/json",
        },
        json={
            "text": text,
            "voice": voice,
            "format": "mp3",
        },
        timeout=timeout,
    )
    # Fail before touching the output file so an error body is never saved
    # as if it were audio.
    response.raise_for_status()
    with open(output_file, "wb") as f:
        f.write(response.content)
    print(f"Saved: {output_file}")
text_to_speech("Hello, this is a test.", "output.mp3")
Node.js
Using the native fetch API (Node 18+):
import { writeFileSync } from 'fs';
/**
 * Synthesize `text` as MP3 via the TTS endpoint and write it to `outputFile`.
 *
 * @param {string} text - Text to synthesize.
 * @param {string} outputFile - Path the audio bytes are written to.
 * @param {string} [voice='en-US-1'] - Voice identifier sent in the request body.
 * @throws {Error} When the API responds with a non-2xx status.
 */
async function textToSpeech(text, outputFile, voice = 'en-US-1') {
  const payload = { text, voice, format: 'mp3' };
  const requestHeaders = {
    'X-API-Key': process.env.SPEEKO_API_KEY,
    'Content-Type': 'application/json',
  };

  const res = await fetch('https://api.speekoapp.com/v1/tts', {
    method: 'POST',
    headers: requestHeaders,
    body: JSON.stringify(payload),
  });

  if (!res.ok) {
    // Surface the raw response body alongside the status code.
    const detail = await res.text();
    throw new Error(`TTS error: ${res.status} ${detail}`);
  }

  const audioBytes = Buffer.from(await res.arrayBuffer());
  writeFileSync(outputFile, audioBytes);
  console.log(`Saved: ${outputFile}`);
}
await textToSpeech('Hello from Node.js.', 'output.mp3');
curl
For shell scripts and quick testing:
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# error response body as output.mp3.
curl --fail -X POST https://api.speekoapp.com/v1/tts \
  -H "X-API-Key: $SPEEKO_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{"text": "Hello from curl.", "voice": "en-US-1", "format": "mp3"}' \
  --output output.mp3
# Play immediately on macOS:
afplay output.mp3
# Play on Linux:
mpg123 output.mp3
Handling Long Text
TTS APIs have per-request character limits (typically 3,000–10,000 characters). For longer content, split at sentence boundaries before sending:
import re
import os
def split_into_chunks(text: str, max_chars: int = 3000) -> list[str]:
    """Split text into chunks of at most ``max_chars`` characters.

    Chunks break at sentence boundaries (whitespace following ``.``, ``!``
    or ``?``). A single sentence longer than ``max_chars`` is broken at word
    boundaries, and a single word longer than ``max_chars`` is sliced, so no
    returned chunk ever exceeds the limit.

    Args:
        text: The text to split. Leading/trailing whitespace is ignored.
        max_chars: Maximum length of each returned chunk.

    Returns:
        The chunks in order; an empty list for empty or whitespace-only text.

    Raises:
        ValueError: If ``max_chars`` is less than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")

    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    chunks: list[str] = []
    current_chunk = ""
    for sentence in sentences:
        for piece in _split_oversized(sentence, max_chars):
            # +1 accounts for the space that joins the piece to the chunk.
            if current_chunk and len(current_chunk) + 1 + len(piece) > max_chars:
                chunks.append(current_chunk)
                current_chunk = piece
            elif current_chunk:
                current_chunk = f"{current_chunk} {piece}"
            else:
                current_chunk = piece
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


def _split_oversized(sentence: str, max_chars: int) -> list[str]:
    """Break a sentence longer than ``max_chars`` into pieces that fit."""
    if len(sentence) <= max_chars:
        return [sentence]

    pieces: list[str] = []
    current = ""
    # Note: str.split() collapses runs of whitespace inside the sentence.
    for word in sentence.split():
        # A token longer than the limit has no boundary to break on, so it
        # is sliced into fixed-width segments.
        while len(word) > max_chars:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:max_chars])
            word = word[max_chars:]
        if current and len(current) + 1 + len(word) > max_chars:
            pieces.append(current)
            current = word
        elif current:
            current = f"{current} {word}"
        else:
            current = word
    if current:
        pieces.append(current)
    return pieces
def long_text_to_speech(text: str, output_file: str):
chunks = split_into_chunks(text)
audio_chunks = []
for i, chunk in enumerate(chunks):
response = requests.post(
"https://api.speekoapp.com/v1/tts",
headers={"X-API-Key": os.environ["SPEEKO_API_KEY"]},
json={"text": chunk, "voice": "en-US-1", "format": "mp3"},
)
response.raise_for_status()
audio_chunks.append(response.content)
print(f"Chunk {i+1}/{len(chunks)} done")
# Concatenate all chunks
with open(output_file, "wb") as f:
for chunk in audio_chunks:
f.write(chunk)
print(f"Saved {len(chunks)} chunks to {output_file}")For cleaner audio at chunk boundaries, use ffmpeg to concatenate properly:
import subprocess
import tempfile
def concat_audio_files(chunk_files: list[str], output_file: str):
with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as list_file:
for path in chunk_files:
list_file.write(f"file '{path}'\n")
list_path = list_file.name
subprocess.run([
"ffmpeg", "-f", "concat", "-safe", "0",
"-i", list_path, "-c", "copy", output_file
], check=True)Output Formats
| Format | Flag | File Size | Best For |
|---|---|---|---|
| MP3 | "format": "mp3" | Small | Web, podcasts, most use cases |
| WAV | "format": "wav" | Large | Audio editing, game dev assets |
| OGG | "format": "ogg" | Smaller than MP3 | Web with bandwidth constraints |
MP3 is the right default. Use WAV when you need to edit the audio or feed it into a DAW. Use OGG when you need smaller files and you control the player (it's not supported everywhere).
Streaming Audio
For real-time playback (voice agents, live feedback), stream the response instead of waiting for the full file:
import requests
import os
def stream_to_file(text: str, output_file: str, timeout: float = 30.0):
    """Stream synthesized audio to ``output_file`` as it is received.

    Args:
        text: Text to synthesize.
        output_file: Path the audio is written to, 4096 bytes at a time.
        timeout: Seconds to wait for the connection and for each read;
            without it a stalled stream would block indefinitely.

    Raises:
        KeyError: If ``SPEEKO_API_KEY`` is not set in the environment.
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server stops responding within ``timeout``.
    """
    with requests.post(
        "https://api.speekoapp.com/v1/tts",
        headers={"X-API-Key": os.environ["SPEEKO_API_KEY"]},
        json={"text": text, "voice": "en-US-1", "format": "mp3"},
        stream=True,
        timeout=timeout,
    ) as response:
        # Check the status before opening the file so a failed request does
        # not create or truncate the output.
        response.raise_for_status()
        with open(output_file, "wb") as f:
            for chunk in response.iter_content(chunk_size=4096):
                f.write(chunk)
# In a real streaming scenario, you'd play each chunk
# as it arrives instead of writing to file
Caching
TTS isn't free. If you're synthesizing the same phrases repeatedly (UI labels, standard notifications, FAQ answers), cache the results:
import hashlib
from pathlib import Path
CACHE_DIR = Path(".tts_cache")
CACHE_DIR.mkdir(exist_ok=True)
def cached_tts(text: str, voice: str = "en-US-1") -> bytes:
key = hashlib.sha256(f"{text}:{voice}".encode()).hexdigest()[:16]
cache_file = CACHE_DIR / f"{key}.mp3"
if cache_file.exists():
return cache_file.read_bytes()
response = requests.post(
"https://api.speekoapp.com/v1/tts",
headers={"X-API-Key": os.environ["SPEEKO_API_KEY"]},
json={"text": text, "voice": voice, "format": "mp3"},
)
response.raise_for_status()
audio = response.content
cache_file.write_bytes(audio)
return audioAt scale, use a CDN or object storage (S3, GCS) instead of local filesystem cache.
Error Handling
Common errors and how to handle them:
import time
def tts_with_retry(text: str, max_attempts: int = 3) -> bytes:
for attempt in range(max_attempts):
try:
response = requests.post(
"https://api.speekoapp.com/v1/tts",
headers={"X-API-Key": os.environ["SPEEKO_API_KEY"]},
json={"text": text, "voice": "en-US-1", "format": "mp3"},
timeout=30,
)
if response.status_code == 429:
# Rate limited — back off
wait = 2 ** attempt
print(f"Rate limited. Waiting {wait}s...")
time.sleep(wait)
continue
if response.status_code == 413:
raise ValueError(f"Text too long: {len(text)} chars")
response.raise_for_status()
return response.content
except requests.Timeout:
if attempt == max_attempts - 1:
raise
time.sleep(2 ** attempt)
raise RuntimeError("Max retries exceeded")Async Batch TTS with asyncio
For generating many audio files concurrently, asyncio + aiohttp outperforms sequential requests:
import asyncio
import aiohttp
import os
async def synthesize_async(
    session: aiohttp.ClientSession,
    text: str,
    output: str,
    voice: str = "am_michael",
):
    """Synthesize ``text`` over a shared aiohttp session and save it.

    Args:
        session: Open client session used to send the request.
        text: Text to synthesize.
        output: Path the MP3 bytes are written to.
        voice: Voice identifier sent in the request body. Exposed as a
            parameter so callers can match the voice used elsewhere.

    Raises:
        KeyError: If ``SPEEKO_API_KEY`` is not set in the environment.
        aiohttp.ClientResponseError: If the API answers with a 4xx/5xx status.
    """
    async with session.post(
        "https://api.speekoapp.com/v1/tts",
        headers={"X-API-Key": os.environ["SPEEKO_API_KEY"]},
        json={"text": text, "voice": voice, "format": "mp3"},
    ) as resp:
        resp.raise_for_status()
        # Read the body while the response context is still open.
        data = await resp.read()
    with open(output, "wb") as f:
        f.write(data)
async def batch_synthesize(items: list[dict]):
    """Synthesize every item concurrently, at most five requests at a time.

    Args:
        items: Dicts with a ``"text"`` key (what to synthesize) and an
            ``"output"`` key (where to save the audio).
    """
    limiter = asyncio.Semaphore(5)  # Max 5 concurrent requests

    async with aiohttp.ClientSession() as session:

        async def run_one(entry):
            # Hold a semaphore slot for the duration of one request.
            async with limiter:
                await synthesize_async(session, entry["text"], entry["output"])

        pending = [run_one(entry) for entry in items]
        await asyncio.gather(*pending)
# Usage: 50 short product blurbs, each saved to its own MP3 file.
items = [
    {
        "text": f"Product {i}: High quality widget with advanced features.",
        "output": f"product_{i}.mp3",
    }
    for i in range(50)
]
asyncio.run(batch_synthesize(items))
50 files that take 30 seconds serially complete in 6–8 seconds with 5 concurrent requests.
Get Started
Sign up at speekoapp.com/register — $5 free credit, no card required. Your first synthesis request takes under a minute.
Related: Text to Speech API Node.js Integration, Python SDK Deep Dive, Reduce TTS Costs with Caching.