Convert Text to Speech Programmatically: Complete Developer Guide

Converting text to speech programmatically boils down to a single HTTP call: you POST text to an API endpoint and get back an audio file. The details — chunking long content, handling formats, caching, streaming — are what this guide covers.

The Basic Pattern

Most TTS APIs follow the same shape:

POST /v1/tts
X-API-Key: <key>
Content-Type: application/json

{"text": "...", "voice": "...", "format": "mp3"}

→ binary audio data

Here's that pattern in three languages.

Python

Using the requests library:

import requests
import os

def text_to_speech(
    text: str,
    output_file: str,
    voice: str = "en-US-1",
    timeout: float = 30,
):
    """Synthesize ``text`` to MP3 via the TTS API and save it to ``output_file``.

    Args:
        text: Text to synthesize.
        output_file: Path the MP3 bytes are written to (overwritten if present).
        voice: Voice identifier sent to the API.
        timeout: Seconds to wait for the server to connect or send data.
            requests waits indefinitely when no timeout is given, so a single
            stalled connection would otherwise hang the caller forever.

    Raises:
        KeyError: If the SPEEKO_API_KEY environment variable is not set.
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.post(
        "https://api.speekoapp.com/v1/tts",
        headers={
            "X-API-Key": os.environ["SPEEKO_API_KEY"],
            "Content-Type": "application/json",
        },
        json={
            "text": text,
            "voice": voice,
            "format": "mp3",
        },
        timeout=timeout,
    )
    # Fail before touching the output file so an error body is never saved as audio.
    response.raise_for_status()

    with open(output_file, "wb") as f:
        f.write(response.content)
    print(f"Saved: {output_file}")

text_to_speech("Hello, this is a test.", "output.mp3")

Node.js

Using the native fetch API (Node 18+):

import { writeFileSync } from 'fs';

/**
 * Synthesize `text` with the TTS API and save the MP3 to `outputFile`.
 *
 * Throws an Error carrying the HTTP status and response body when the
 * API replies with a non-2xx status.
 */
async function textToSpeech(text, outputFile, voice = 'en-US-1') {
  const payload = { text, voice, format: 'mp3' };
  const headers = {
    'X-API-Key': process.env.SPEEKO_API_KEY,
    'Content-Type': 'application/json',
  };

  const res = await fetch('https://api.speekoapp.com/v1/tts', {
    method: 'POST',
    headers,
    body: JSON.stringify(payload),
  });

  if (!res.ok) {
    const detail = await res.text();
    throw new Error(`TTS error: ${res.status} ${detail}`);
  }

  // fetch yields an ArrayBuffer; wrap it in a Buffer for the fs API.
  const audio = await res.arrayBuffer();
  writeFileSync(outputFile, Buffer.from(audio));
  console.log(`Saved: ${outputFile}`);
}

await textToSpeech('Hello from Node.js.', 'output.mp3');

curl

For shell scripts and quick testing:

# --fail makes curl exit non-zero on an HTTP error instead of saving the
# error response body into output.mp3 as if it were audio.
curl --fail -X POST https://api.speekoapp.com/v1/tts \
  -H "X-API-Key: $SPEEKO_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{"text": "Hello from curl.", "voice": "en-US-1", "format": "mp3"}' \
  --output output.mp3

# Play immediately on macOS:
afplay output.mp3

# Play on Linux:
mpg123 output.mp3

Handling Long Text

TTS APIs have per-request character limits (typically 3,000–10,000 characters). For longer content, split at sentence boundaries before sending:

import re
import os

def _split_long_sentence(sentence: str, max_chars: int) -> list[str]:
    """Break one over-long sentence into pieces of at most ``max_chars`` characters.

    Pieces are cut at whitespace where possible; a single word longer than
    ``max_chars`` is hard-sliced because it has no better break point.
    """
    parts = []
    current = ""
    for word in sentence.split():
        while len(word) > max_chars:
            # Flush what we have so the slices of the long word stay in order.
            if current:
                parts.append(current)
                current = ""
            parts.append(word[:max_chars])
            word = word[max_chars:]
        if not word:
            continue
        if current and len(current) + 1 + len(word) > max_chars:
            parts.append(current)
            current = word
        else:
            current = f"{current} {word}" if current else word
    if current:
        parts.append(current)
    return parts


def split_into_chunks(text: str, max_chars: int = 3000) -> list[str]:
    """Split text into chunks of at most ``max_chars`` characters.

    Chunks break at sentence boundaries (whitespace following ``.``, ``!`` or
    ``?``). A sentence that is itself longer than ``max_chars`` is split
    further at whitespace, and as a last resort mid-word, so that no returned
    chunk ever exceeds the limit.

    Args:
        text: The text to split. Leading/trailing whitespace is ignored.
        max_chars: Maximum length of each returned chunk; must be >= 1.

    Returns:
        The chunks in original order; an empty list for empty or
        whitespace-only text.

    Raises:
        ValueError: If ``max_chars`` is less than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    chunks = []
    current_chunk = ""

    for sentence in re.split(r'(?<=[.!?])\s+', text.strip()):
        if len(sentence) <= max_chars:
            units = [sentence]
        else:
            units = _split_long_sentence(sentence, max_chars)

        for unit in units:
            # +1 accounts for the joining space between chunk and unit.
            if current_chunk and len(current_chunk) + 1 + len(unit) > max_chars:
                chunks.append(current_chunk)
                current_chunk = unit
            else:
                current_chunk = f"{current_chunk} {unit}".strip()

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

def long_text_to_speech(text: str, output_file: str, timeout: float = 30):
    """Synthesize arbitrarily long text by chunking it and joining the audio.

    The text is split with ``split_into_chunks``, each chunk is synthesized
    in order, and the MP3 responses are written back to back into
    ``output_file``. Nothing is written until every chunk has succeeded, so
    a failed request never leaves a partial file behind.

    Args:
        text: Text to synthesize.
        output_file: Path the combined MP3 bytes are written to.
        timeout: Per-request seconds to wait for the server. requests waits
            indefinitely without one, so a stalled chunk would hang the run.

    Raises:
        KeyError: If the SPEEKO_API_KEY environment variable is not set.
        requests.HTTPError: If any chunk request gets a 4xx/5xx response.
        requests.Timeout: If a chunk request exceeds ``timeout``.
    """
    chunks = split_into_chunks(text)
    audio_chunks = []

    # A single Session reuses the connection across the chunk requests
    # instead of opening a new one for every chunk.
    with requests.Session() as session:
        session.headers["X-API-Key"] = os.environ["SPEEKO_API_KEY"]

        for i, chunk in enumerate(chunks):
            response = session.post(
                "https://api.speekoapp.com/v1/tts",
                json={"text": chunk, "voice": "en-US-1", "format": "mp3"},
                timeout=timeout,
            )
            response.raise_for_status()
            audio_chunks.append(response.content)
            print(f"Chunk {i+1}/{len(chunks)} done")

    # Concatenate all chunks
    with open(output_file, "wb") as f:
        f.writelines(audio_chunks)

    print(f"Saved {len(chunks)} chunks to {output_file}")

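Appending MP3 byte streams back to back, as above, plays in most players but can leave audible gaps or clicks at the joins and incorrect duration metadata. For cleaner audio at chunk boundaries, save each chunk to its own file and let ffmpeg concatenate them: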

import subprocess
import tempfile

def concat_audio_files(chunk_files: list[str], output_file: str):
    """Join audio files into ``output_file`` using ffmpeg's concat demuxer.

    A temporary list file naming every input is written, handed to ffmpeg
    with stream copy (no re-encoding), and removed again afterwards.

    Args:
        chunk_files: Paths of the audio files to join, in playback order.
        output_file: Path of the combined file ffmpeg should write.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ffmpeg executable cannot be found.
    """
    import os  # local import: this snippet only imports subprocess and tempfile

    list_file = tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False)
    try:
        with list_file:
            for path in chunk_files:
                # The concat demuxer resolves relative paths against the list
                # file's directory (the temp dir), not the current directory.
                absolute = os.path.abspath(path)
                # Inside a single-quoted entry a literal quote must be written
                # as '\'' (close quote, escaped quote, reopen quote).
                escaped = absolute.replace("'", "'\\''")
                list_file.write(f"file '{escaped}'\n")

        subprocess.run([
            "ffmpeg", "-f", "concat", "-safe", "0",
            "-i", list_file.name, "-c", "copy", output_file
        ], check=True)
    finally:
        # delete=False keeps the file for ffmpeg to read, so clean it up here.
        os.remove(list_file.name)

Output Formats

Format	Flag	File Size	Best For
MP3	`"format": "mp3"`	Small	Web, podcasts, most use cases
WAV	`"format": "wav"`	Large	Audio editing, game dev assets
OGG	`"format": "ogg"`	Smaller than MP3	Web with bandwidth constraints

MP3 is the right default. Use WAV when you need to edit the audio or feed it into a DAW. Use OGG when you need smaller files and control the player (it isn't supported everywhere).

Streaming Audio

For real-time playback (voice agents, live feedback), stream the response instead of waiting for the full file:

import requests
import os

def stream_to_file(text: str, output_file: str, timeout: float = 30):
    """Stream synthesized audio to ``output_file`` as it arrives.

    The response body is read in 4096-byte pieces instead of being
    buffered in memory as a whole.

    Args:
        text: Text to synthesize.
        output_file: Path the MP3 bytes are written to.
        timeout: Seconds to wait for the connection and for each read from
            the server. Without it a stalled stream would block forever.

    Raises:
        KeyError: If the SPEEKO_API_KEY environment variable is not set.
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server stalls for longer than ``timeout``.
    """
    with requests.post(
        "https://api.speekoapp.com/v1/tts",
        headers={"X-API-Key": os.environ["SPEEKO_API_KEY"]},
        json={"text": text, "voice": "en-US-1", "format": "mp3"},
        stream=True,
        timeout=timeout,
    ) as response:
        # Check the status before opening the file so an error reply
        # never creates or truncates the output.
        response.raise_for_status()
        with open(output_file, "wb") as f:
            for chunk in response.iter_content(chunk_size=4096):
                f.write(chunk)
                # In a real streaming scenario, you'd play each chunk
                # as it arrives instead of writing to file

Caching

TTS isn't free. If you're synthesizing the same phrases repeatedly (UI labels, standard notifications, FAQ answers), cache the results:

import hashlib
from pathlib import Path

# Directory holding one MP3 per cached (text, voice) pair; created on import.
CACHE_DIR = Path(".tts_cache")
CACHE_DIR.mkdir(exist_ok=True)

def cached_tts(text: str, voice: str = "en-US-1") -> bytes:
    """Return MP3 audio for ``text``, synthesizing only on a cache miss.

    Results are stored in ``CACHE_DIR`` under a name derived from the first
    16 hex digits of the SHA-256 of ``"{text}:{voice}"``.

    Args:
        text: Text to synthesize.
        voice: Voice identifier sent to the API and mixed into the cache key.

    Returns:
        The MP3 bytes, from the cache when present, otherwise from the API.

    Raises:
        KeyError: On a cache miss when SPEEKO_API_KEY is not set.
        requests.HTTPError: On a cache miss when the API returns 4xx/5xx.
        requests.Timeout: On a cache miss when the API does not respond.
    """
    key = hashlib.sha256(f"{text}:{voice}".encode()).hexdigest()[:16]
    cache_file = CACHE_DIR / f"{key}.mp3"

    # Read directly rather than exists()-then-read: no window in which the
    # file can disappear between the check and the read.
    try:
        return cache_file.read_bytes()
    except FileNotFoundError:
        pass

    response = requests.post(
        "https://api.speekoapp.com/v1/tts",
        headers={"X-API-Key": os.environ["SPEEKO_API_KEY"]},
        json={"text": text, "voice": voice, "format": "mp3"},
        timeout=30,
    )
    response.raise_for_status()
    audio = response.content

    # Write to a per-process temp name and rename into place, so a crash
    # mid-write can never leave a truncated file that later counts as a hit.
    tmp_file = cache_file.with_name(f"{cache_file.name}.{os.getpid()}.tmp")
    tmp_file.write_bytes(audio)
    tmp_file.replace(cache_file)
    return audio

At scale, use a CDN or object storage (S3, GCS) instead of local filesystem cache.

Error Handling

Common errors and how to handle them:

import time

def tts_with_retry(text: str, max_attempts: int = 3) -> bytes:
    """Synthesize ``text``, retrying transient failures with backoff.

    Timeouts, connection errors and HTTP 429 responses are retried up to
    ``max_attempts`` times. The wait between attempts is ``2 ** attempt``
    seconds, except that a 429 carrying a numeric ``Retry-After`` header
    waits for the number of seconds the server asked for.

    Args:
        text: Text to synthesize.
        max_attempts: Total number of requests to make before giving up.

    Returns:
        The MP3 bytes of the successful response.

    Raises:
        ValueError: If the API rejects the text as too long (HTTP 413).
        requests.HTTPError: For any other 4xx/5xx response.
        requests.Timeout: If the final attempt times out.
        requests.ConnectionError: If the final attempt cannot connect.
        RuntimeError: If every attempt was rate limited, or
            ``max_attempts`` is less than 1.
    """
    for attempt in range(max_attempts):
        is_last_attempt = attempt == max_attempts - 1

        try:
            response = requests.post(
                "https://api.speekoapp.com/v1/tts",
                headers={"X-API-Key": os.environ["SPEEKO_API_KEY"]},
                json={"text": text, "voice": "en-US-1", "format": "mp3"},
                timeout=30,
            )
        except (requests.Timeout, requests.ConnectionError):
            if is_last_attempt:
                raise
            time.sleep(2 ** attempt)
            continue

        if response.status_code == 429:
            if is_last_attempt:
                # No retry will follow, so sleeping would only delay the error.
                break
            # Rate limited — back off, preferring the server's own hint.
            # Only the delay-seconds form of Retry-After is honored; an
            # HTTP-date value falls back to exponential backoff.
            retry_after = response.headers.get("Retry-After", "").strip()
            wait = int(retry_after) if retry_after.isdecimal() else 2 ** attempt
            print(f"Rate limited. Waiting {wait}s...")
            time.sleep(wait)
            continue

        if response.status_code == 413:
            raise ValueError(f"Text too long: {len(text)} chars")

        response.raise_for_status()
        return response.content

    raise RuntimeError("Max retries exceeded")

Async Batch TTS with asyncio

For generating many audio files concurrently, asyncio + aiohttp outperforms sequential requests:

import asyncio
import aiohttp
import os

async def synthesize_async(session: aiohttp.ClientSession, text: str, output: str):
    """POST ``text`` to the TTS endpoint via ``session`` and save the MP3 to ``output``.

    Raises whatever ``raise_for_status`` raises when the API responds with
    an error status; in that case no file is written.
    """
    request_headers = {"X-API-Key": os.environ["SPEEKO_API_KEY"]}
    payload = {"text": text, "voice": "am_michael", "format": "mp3"}

    async with session.post(
        "https://api.speekoapp.com/v1/tts",
        headers=request_headers,
        json=payload,
    ) as reply:
        reply.raise_for_status()
        audio = await reply.read()
        with open(output, "wb") as out:
            out.write(audio)

async def batch_synthesize(items: list[dict], max_concurrent: int = 5):
    """Synthesize many items concurrently over one shared HTTP session.

    Args:
        items: Dicts with a ``"text"`` key (what to synthesize) and an
            ``"output"`` key (path the MP3 is written to).
        max_concurrent: Upper bound on requests in flight at once; must be
            at least 1.

    Raises:
        ValueError: If ``max_concurrent`` is less than 1.
        Exception: The first error raised by any item's synthesis
            propagates out of ``asyncio.gather``.
    """
    if max_concurrent < 1:
        # Semaphore(0) would block every task forever instead of failing.
        raise ValueError("max_concurrent must be at least 1")

    sem = asyncio.Semaphore(max_concurrent)

    async def bounded(session, item):
        # The semaphore caps how many requests are in flight at once.
        async with sem:
            await synthesize_async(session, item["text"], item["output"])

    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(bounded(session, item) for item in items))

# Usage: synthesize 50 product blurbs, each into its own MP3 file.
items = [
    {
        "text": f"Product {n}: High quality widget with advanced features.",
        "output": f"product_{n}.mp3",
    }
    for n in range(50)
]
asyncio.run(batch_synthesize(items))

50 files that take 30 seconds serially complete in 6–8 seconds with 5 concurrent requests.

Convert Text to Speech Programmatically: Complete Developer Guide

Convert Text to Speech Programmatically: Complete Developer Guide

The Basic Pattern

Python

Node.js

curl

Handling Long Text

Output Formats

Streaming Audio

Caching

Error Handling

Async Batch TTS with asyncio

Get Started

Related articles

Natural Sounding TTS API: How to Get Human-Like Voice Quality

AI TTS API: Best Neural Text-to-Speech Options in 2025