{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "type": "object",
  "properties": {
    "text": {
      "description": "The text to be synthesized into speech. Maximum input of 2,000 characters.",
      "type": "string",
      "maxLength": 2000
    },
    "voice_id": {
      "description": "The ID of the voice to use for synthesizing speech. Defaults to Dennis.",
      "default": "Dennis",
      "type": "string",
      "enum": [
        "Loretta", "Darlene", "Marlene", "Hank", "Evelyn", "Celeste", "Pippa", "Tessa",
        "Liam", "Callum", "Hamish", "Abby", "Graham", "Rupert", "Mortimer", "Snik",
        "Anjali", "Saanvi", "Arjun", "Claire", "Oliver", "Simon", "Elliot", "James",
        "Serena", "Gareth", "Vinny", "Lauren", "Jessica", "Ethan", "Tyler", "Jason",
        "Chloe", "Veronica", "Victoria", "Miranda", "Sebastian", "Victor", "Malcolm", "Nate",
        "Brian", "Amina", "Kelsey", "Derek", "Evan", "Kayla", "Jake", "Grant",
        "Tristan", "Nadia", "Selene", "Marcus", "Riley", "Damon", "Cedric", "Mia",
        "Naomi", "Jonah", "Levi", "Avery", "Brandon", "Conrad", "Bianca", "Lucian",
        "Trevor", "Alex", "Ashley", "Craig", "Deborah", "Dennis", "Edward", "Elizabeth",
        "Hades", "Julia", "Pixie", "Mark", "Olivia", "Priya", "Ronald", "Sarah",
        "Shaun", "Theodore", "Timothy", "Wendy", "Dominus", "Hana", "Clive", "Carter",
        "Blake", "Luna", "Reed", "Duncan", "Felix", "Eleanor", "Sophie"
      ]
    },
    "output_format": {
      "description": "The output format for the audio. Supported formats are mp3, opus, wav, and flac. Defaults to mp3.",
      "default": "mp3",
      "type": "string",
      "enum": ["mp3", "opus", "wav", "flac"]
    },
    "bit_rate": {
      "description": "Bits per second of the audio. Only for compressed audio formats (mp3, opus). The default is 128,000.",
      "type": "integer",
      "minimum": 1,
      "maximum": 9007199254740991
    },
    "sample_rate": {
      "description": "The synthesis sample rate in hertz. Accepts: 8000, 16000, 22050, 24000, 32000, 44100, 48000. The default is 48,000.",
      "type": "integer",
      "enum": [8000, 16000, 22050, 24000, 32000, 44100, 48000]
    },
    "speaking_rate": {
      "description": "Speaking rate/speed, in the range [0.5, 1.5]. The default is 1.0. \nWe recommend using values above 0.8 to ensure high quality.",
      "type": "number",
      "minimum": 0.5,
      "maximum": 1.5
    },
    "temperature": {
      "description": "Determines the degree of randomness when sampling audio tokens. Defaults to 1.0. Accepts values between 0 (exclusive) and 2 (inclusive). Higher values = more expressive, lower values = more deterministic.",
      "default": 1,
      "type": "number",
      "exclusiveMinimum": 0,
      "maximum": 2
    },
    "timestamp_type": {
      "description": "Controls timestamp metadata returned with the audio. \"word\" returns word-level timing, \"character\" returns character-level timing. Note: adds latency. Defaults to none.",
      "default": "none",
      "type": "string",
      "enum": ["none", "word", "character"]
    },
    "apply_text_normalization": {
      "description": "When enabled, text normalization expands numbers, dates, times, and abbreviations before converting to speech. Turning this off may reduce latency.",
      "type": "boolean"
    }
  },
  "required": ["text", "voice_id", "output_format", "temperature", "timestamp_type"],
  "additionalProperties": false
}