{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "additionalProperties": false,
  "properties": {
    "audio_format": {
      "description": "Format hint for raw/headerless audio. Required for pcm, mulaw, alaw. Omit for container formats (mp3, wav, etc.) — xAI auto-detects them.",
      "enum": ["pcm", "mulaw", "alaw"],
      "type": "string"
    },
    "channels": {
      "description": "Number of audio channels (2–8). Required only for multichannel raw audio; auto-detected for container formats.",
      "maximum": 8,
      "minimum": 2,
      "type": "integer"
    },
    "diarize": {
      "description": "When true, enables speaker diarization. Each word in the response includes a `speaker` integer identifying the detected speaker.",
      "type": "boolean"
    },
    "file": {
      "description": "Audio file as a data URI (data:audio/...;base64,...) or an HTTPS URL the gateway fetches and uploads. Supported container formats: flac, mp3, mp4, m4a, mkv, ogg, opus, wav, aac. Raw formats (pcm, mulaw, alaw) also accepted — supply audio_format and sample_rate. Gateway-side size limit: 25 MB. Mutually exclusive with `url`.",
      "type": "string"
    },
    "filler_words": {
      "description": "When true, filler words (uh, um, er) are included in the transcript. Defaults to false — filler words are removed.",
      "type": "boolean"
    },
    "format": {
      "description": "When true, enables Inverse Text Normalization — spoken numbers and currencies are converted to written form (e.g. \"one hundred dollars\" → \"$100\"). Requires language to be set.",
      "type": "boolean"
    },
    "keyterm": {
      "description": "Key terms to bias transcription toward (e.g. product names, proper nouns). Each term up to 50 characters, max 100 terms. Sent as repeated form fields: keyterm=Term+One&keyterm=Term+Two.",
      "items": {
        "maxLength": 50,
        "type": "string"
      },
      "maxItems": 100,
      "type": "array"
    },
    "language": {
      "description": "Language code (e.g. \"en\", \"fr\", \"de\"). Used with format=true to enable Inverse Text Normalization. xAI transcribes in any language regardless — supplying this enables number/currency formatting in the transcript.",
      "type": "string"
    },
    "multichannel": {
      "description": "When true, each audio channel is transcribed independently. Results are returned in the `channels` array. Requires channels ≥ 2.",
      "type": "boolean"
    },
    "sample_rate": {
      "description": "Sample rate in Hz. Required when audio_format is set.",
      "maximum": 9007199254740991,
      "minimum": 1,
      "type": "integer"
    },
    "url": {
      "description": "HTTPS URL of an audio file for xAI to fetch server-side. Mutually exclusive with `file`. No gateway-side size limit applies.",
      "format": "uri",
      "type": "string"
    }
  },
  "type": "object"
}