Below is an example of how to use the text-to-speech API in C++. After creating an AILIAVoice instance and opening the model with ailiaVoiceOpenModelFile, use ailiaVoiceGraphemeToPhoneme to convert the text to phonemes, then use ailiaVoiceInference to perform the text-to-speech conversion; the resulting audio waveform can then be retrieved with ailiaVoiceGetWave. When using GPT-SoVITS, provide a reference audio file with ailiaVoiceSetReference before calling ailiaVoiceInference.
#include "ailia_voice_util.h"
#include <stdio.h>
#include <vector>
#include <string>
#include <string.h>
#include "wave_reader.h"
#include "wave_writer.h"
int main(int argc, char *argv[]){
printf("Usage : ailia_voice_sample [tacotron2/gpt-sovits/gpt-sovits-en] [input_text]\n");
const char * input_text = "";
const char * lang = "";
const char * model = "tacotron2";
if (argc >= 2){
model = argv[1];
if (!(strcmp(model, "tacotron2") == 0 || strcmp(model, "gpt-sovits") == 0 || strcmp(model, "gpt-sovits-en") == 0)){
printf("model must be tacotron2 or gpt-sovits\n");
return -1;
}
}
if (argc >= 3){
input_text = argv[2];
}
if (strcmp(model, "tacotron2") == 0 || strcmp(model, "gpt-sovits-en") == 0 ){
if (strlen(input_text) == 0){
input_text = u8"Hello world.";
}
lang = "en";
}else{
if (strlen(input_text) == 0){
input_text = u8"こんにちは。今日は新しいAIエンジンであるアイリアSDKを紹介します。";
}
lang = "ja";
}
printf("Model : %s\n", model);
printf("Input text : %s\n", input_text);
printf("Language : %s\n", lang);
AILIAVoice *net;
int env_id = AILIA_ENVIRONMENT_ID_AUTO;
int num_thread = AILIA_MULTITHREAD_AUTO;
int memory_mode = AILIA_MEMORY_REDUCE_CONSTANT | AILIA_MEMORY_REDUCE_CONSTANT_WITH_INPUT_INITIALIZER | AILIA_MEMORY_REUSE_INTERSTAGE;
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceCreate error %d\n", status);
return -1;
}
if (strcmp(model, "gpt-sovits") == 0 || strcmp(model, "gpt-sovits-en") == 0){
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceOpenDictionaryFileA error %d\n", status);
return -1;
}
}
if (strcmp(model, "gpt-sovits-en") == 0){
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceOpenDictionaryFileA error %d\n", status);
return -1;
}
}
if (strcmp(model, "tacotron2") == 0){
}else{
}
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceOpenModelFileA error %d\n", status);
return -1;
}
if (strcmp(model, "gpt-sovits") == 0 || strcmp(model, "gpt-sovits-en") == 0){
int sampleRate, nChannels, nSamples;
const char *ref_audio = "../onnx/gpt-sovits/reference_audio_girl.wav";
std::vector<float> wave = read_wave_file(ref_audio, &sampleRate, &nChannels, &nSamples);
const char *ref_text = "水をマレーシアから買わなくてはならない。";
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceGraphemeToPhoneme error %d\n", status);
return -1;
}
unsigned int len = 0;
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceGetFeatureLength error %d\n", status);
return -1;
}
std::vector<char> ref_features;
ref_features.resize(len);
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceGetFeatures error %d\n", status);
return -1;
}
printf("Reference Features : %s\n", &ref_features[0]);
status =
ailiaVoiceSetReference(net, &wave[0], wave.size() *
sizeof(
float), nChannels, sampleRate, &ref_features[0]);
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceSetReference error %d\n", status);
return -1;
}
}
std::vector<char> features;
if (strcmp(model, "tacotron2") == 0){
}else{
if (strcmp(model, "gpt-sovits") == 0){
}else{
}
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceGraphemeToPhoneme error %d\n", status);
return -1;
}
unsigned int len = 0;
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceGetFeatureLength error %d\n", status);
return -1;
}
features.resize(len);
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceGetFeatures error %d\n", status);
return -1;
}
printf("Features : %s\n", &features[0]);
}
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceInference error %d\n", status);
return -1;
}
unsigned int samples, channels, sampling_rate;
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceGetWaveInfo error %d\n", status);
return -1;
}
std::vector<float> buf(samples * channels);
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceGetWave error %d\n", status);
return -1;
}
printf("Wave samples : %d\nWave channles : %d\nWave sampling rate : %d\n", samples, channels, sampling_rate);
write_wave_file("output.wav", buf, sampling_rate);
return 0;
}
#define AILIA_VOICE_G2P_TYPE_GPT_SOVITS_JA
GPT SOVITS Japanese.
Definition: ailia_voice.h:143
int AILIA_API ailiaVoiceGraphemeToPhoneme(struct AILIAVoice *net, const char *utf8, int g2p_type)
Perform g2p.
int AILIA_API ailiaVoiceSetReference(struct AILIAVoice *net, float *buf, unsigned int buf_size, unsigned int channels, unsigned int sampling_rate, const char *features)
Set the waveform and text as references for zero-shot voice synthesis.
#define AILIA_VOICE_DICTIONARY_TYPE_OPEN_JTALK
Format for OpenJTalk.
Definition: ailia_voice.h:43
#define AILIA_VOICE_G2P_TYPE_GPT_SOVITS_EN
GPT SOVITS English.
Definition: ailia_voice.h:132
int AILIA_API ailiaVoiceInference(struct AILIAVoice *net, const char *utf8)
Perform inference.
int AILIA_API ailiaVoiceGetFeatureLength(struct AILIAVoice *net, unsigned int *len)
Gets the size of the features, including the terminating null character.
int AILIA_API ailiaVoiceGetWave(struct AILIAVoice *net, float *buf, unsigned int buf_size)
Gets the synthesized audio waveform.
int AILIA_API ailiaVoiceGetWaveInfo(struct AILIAVoice *net, unsigned int *samples, unsigned int *channels, unsigned int *sampling_rate)
Gets the information of wave.
int AILIA_API ailiaVoiceCreate(struct AILIAVoice **net, int env_id, int num_thread, int memory_mode, int flags, AILIAVoiceApiCallback callback, int version)
Creates a Voice instance.
#define AILIA_VOICE_FLAG_NONE
Default flag.
Definition: ailia_voice.h:117
#define AILIA_VOICE_MODEL_TYPE_TACOTRON2
Format for Tacotron2.
Definition: ailia_voice.h:69
int AILIA_API ailiaVoiceGetFeatures(struct AILIAVoice *net, char *features, unsigned int len)
Gets the decoded features.
int AILIA_API ailiaVoiceOpenDictionaryFileA(struct AILIAVoice *net, const char *dictionary_path, int dictionary_type)
Set dictionary into a network instance.
#define AILIA_VOICE_DICTIONARY_TYPE_G2P_EN
Format for G2P_EN.
Definition: ailia_voice.h:54
#define AILIA_VOICE_API_CALLBACK_VERSION
Struct version.
Definition: ailia_voice.h:191
#define AILIA_VOICE_MODEL_TYPE_GPT_SOVITS
Format for GPT-SoVITS.
Definition: ailia_voice.h:80
int AILIA_API ailiaVoiceOpenModelFileA(struct AILIAVoice *net, const char *encoder, const char *decoder1, const char *decoder2, const char *wave, const char *ssl, int model_type, int cleaner_type)
Set models into a network instance.
void AILIA_API ailiaVoiceDestroy(struct AILIAVoice *net)
It destroys the Voice instance.
#define AILIA_VOICE_CLEANER_TYPE_BASIC
BasicCleaner.
Definition: ailia_voice.h:91
Definition: ailia_voice.h:194
The userdic.dic created with pyopenjtalk can be loaded by executing the ailiaVoiceSetUserDictionaryFile API before the ailiaVoiceOpenDictionaryFile API.