STT Module Survey (Naver, Google, Microsoft)
1. Naver STT
- Android and iPhone only; it is not REST-based, so it cannot be used on PC.
2. Google STT
- Supports C# among other languages, but there is no iOS support.
- REST is supported, but there was earlier feedback that it is slow (from Jason).
- Needs verification that the C# SDK can be used inside Unity (almost certainly it cannot).
- To use it, we would have to go through REST, or build a server and relay STT through server communication; a minimal REST sketch follows below.
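* Reference: a minimal sketch of the REST route from Unity, assuming a Google Cloud API key with Speech-to-Text enabled and 16 kHz mono 16-bit PCM audio. Endpoint and body follow the Cloud Speech-to-Text v1 REST reference; the key value and class name are illustrative, so verify before use.
using System;
using System.Collections;
using System.Text;
using UnityEngine;
using UnityEngine.Networking;
// Sketch only: sends raw 16-bit PCM samples to Google STT over REST.
public class GoogleSttRestSketch : MonoBehaviour
{
    // Hypothetical placeholder; replace with a real Google Cloud API key.
    private const string ApiKey = "YOUR_GOOGLE_API_KEY";
    public IEnumerator CoRecognize(byte[] pcm16kMono)
    {
        string url = "https://speech.googleapis.com/v1/speech:recognize?key=" + ApiKey;
        // Request body per the Speech-to-Text v1 REST reference:
        // LINEAR16 = uncompressed 16-bit signed little-endian PCM.
        string body = "{\"config\":{\"encoding\":\"LINEAR16\",\"sampleRateHertz\":16000,"
            + "\"languageCode\":\"en-US\"},\"audio\":{\"content\":\""
            + Convert.ToBase64String(pcm16kMono) + "\"}}";
        using (var request = new UnityWebRequest(url, "POST"))
        {
            request.uploadHandler = new UploadHandlerRaw(Encoding.UTF8.GetBytes(body));
            request.downloadHandler = new DownloadHandlerBuffer();
            request.SetRequestHeader("Content-Type", "application/json");
            yield return request.SendWebRequest();
            // On success the JSON contains results[].alternatives[].transcript.
            Debug.Log(string.IsNullOrEmpty(request.error) ? request.downloadHandler.text : request.error);
        }
    }
}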
3. Microsoft STT
- A Unity asset is available.
- REST is also supported, but requests are limited to 60 seconds of audio.
* With the REST API, pronunciation assessment is possible: https://docs.microsoft.com/ko-kr/azure/cognitive-services/speech-service/rest-speech-to-text#pronunciation-assessment-parameters
- A 30-day trial is available.
- To get started, see https://docs.microsoft.com/ko-kr/azure/cognitive-services/speech-service/get-started
- Sample project build instructions: https://github.com/Azure-Samples/cognitive-services-speech-sdk/tree/master/quickstart/csharp/unity/from-microphone
* A concurrency increase must be requested: communication is 1:1 by default, so errors can occur when multiple users connect at once. It can be expanded to 20 concurrent connections.
=> See the notes on increasing concurrency at https://docs.microsoft.com/ko-kr/azure/cognitive-services/speech-service/faq-stt
3.1. Development Notes (SDK)
- Create a SpeechConfig object (using the subscription key and region name).
- Create a SpeechRecognizer object (using the SpeechConfig and the microphone).
- Add SpeechRecognizer event handlers (recognizing, recognized, and canceled events).
- Example code:
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
using UnityEngine;
using UnityEngine.UI;
using Microsoft.CognitiveServices.Speech;
using System;
using System.Collections;
using Microsoft.CognitiveServices.Speech.Audio;
using System.IO;
#if PLATFORM_ANDROID
using UnityEngine.Android;
#endif
#if PLATFORM_IOS
using UnityEngine.iOS;
using System.Collections;
#endif
public class HelloWorld : MonoBehaviour
{
private bool micPermissionGranted = false;
public Text outputText;
public Button recoButton;
SpeechRecognizer recognizer;
SpeechConfig config;
AudioConfig audioInput;
PushAudioInputStream pushStream;
private object threadLocker = new object();
private bool recognitionStarted = false;
private string message;
int lastSample = 0;
AudioSource audioSource;
#if PLATFORM_ANDROID || PLATFORM_IOS
// Required to manifest microphone permission, cf.
// https://docs.unity3d.com/Manual/android-manifest.html
private Microphone mic;
#endif
private byte[] ConvertAudioClipDataToInt16ByteArray(float[] data)
{
MemoryStream dataStream = new MemoryStream();
int x = sizeof(Int16);
Int16 maxValue = Int16.MaxValue;
int i = 0;
while (i < data.Length)
{
dataStream.Write(BitConverter.GetBytes(Convert.ToInt16(data[i] * maxValue)), 0, x);
++i;
}
byte[] bytes = dataStream.ToArray();
dataStream.Dispose();
return bytes;
}
private void RecognizingHandler(object sender, SpeechRecognitionEventArgs e)
{
lock (threadLocker)
{
message = e.Result.Text;
Debug.Log("RecognizingHandler: " + message);
}
}
private void RecognizedHandler(object sender, SpeechRecognitionEventArgs e)
{
lock (threadLocker)
{
message = e.Result.Text;
Debug.Log("RecognizedHandler: " + message);
}
}
private void CanceledHandler(object sender, SpeechRecognitionCanceledEventArgs e)
{
lock (threadLocker)
{
message = e.ErrorDetails.ToString();
Debug.Log("CanceledHandler: " + message);
}
}
public async void ButtonClick()
{
if (recognitionStarted)
{
await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(true);
if (Microphone.IsRecording(Microphone.devices[0]))
{
Debug.Log("Microphone.End: " + Microphone.devices[0]);
Microphone.End(null);
lastSample = 0;
}
lock (threadLocker)
{
recognitionStarted = false;
Debug.Log("RecognitionStarted: " + recognitionStarted.ToString());
}
}
else
{
if (!Microphone.IsRecording(Microphone.devices[0]))
{
Debug.Log("Microphone.Start: " + Microphone.devices[0]);
audioSource.clip = Microphone.Start(Microphone.devices[0], true, 200, 16000);
Debug.Log("audioSource.clip channels: " + audioSource.clip.channels);
Debug.Log("audioSource.clip frequency: " + audioSource.clip.frequency);
}
await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);
lock (threadLocker)
{
recognitionStarted = true;
Debug.Log("RecognitionStarted: " + recognitionStarted.ToString());
}
}
}
void Start()
{
if (outputText == null)
{
UnityEngine.Debug.LogError("outputText property is null! Assign a UI Text element to it.");
}
else if (recoButton == null)
{
message = "recoButton property is null! Assign a UI Button to it.";
UnityEngine.Debug.LogError(message);
}
else
{
// Continue with normal initialization, Text and Button objects are present.
#if PLATFORM_ANDROID
// Request to use the microphone, cf.
// https://docs.unity3d.com/Manual/android-RequestingPermissions.html
message = "Waiting for mic permission";
if (!Permission.HasUserAuthorizedPermission(Permission.Microphone))
{
Permission.RequestUserPermission(Permission.Microphone);
}
#elif PLATFORM_IOS
if (!Application.HasUserAuthorization(UserAuthorization.Microphone))
{
Application.RequestUserAuthorization(UserAuthorization.Microphone);
}
#else
micPermissionGranted = true;
message = "Click button to recognize speech";
#endif
// Enter the subscription key and region name
config = SpeechConfig.FromSubscription("49b020847c33407197061fee21f33df8", "westus");
pushStream = AudioInputStream.CreatePushStream();
audioInput = AudioConfig.FromStreamInput(pushStream);
recognizer = new SpeechRecognizer(config, audioInput);
recognizer.Recognizing += RecognizingHandler;
recognizer.Recognized += RecognizedHandler;
recognizer.Canceled += CanceledHandler;
recoButton.onClick.AddListener(ButtonClick);
foreach (var device in Microphone.devices)
{
Debug.Log("DeviceName: " + device);
}
audioSource = GameObject.Find("MyAudioSource").GetComponent<AudioSource>();
}
}
void OnDisable()
{
recognizer.Recognizing -= RecognizingHandler;
recognizer.Recognized -= RecognizedHandler;
recognizer.Canceled -= CanceledHandler;
pushStream.Close();
recognizer.Dispose();
}
void FixedUpdate()
{
#if PLATFORM_ANDROID
if (!micPermissionGranted && Permission.HasUserAuthorizedPermission(Permission.Microphone))
{
micPermissionGranted = true;
message = "Click button to recognize speech";
}
#elif PLATFORM_IOS
if (!micPermissionGranted && Application.HasUserAuthorization(UserAuthorization.Microphone))
{
micPermissionGranted = true;
message = "Click button to recognize speech";
}
#endif
lock (threadLocker)
{
if (recoButton != null)
{
recoButton.interactable = micPermissionGranted;
}
if (outputText != null)
{
outputText.text = message;
}
}
if (Microphone.IsRecording(Microphone.devices[0]) && recognitionStarted == true)
{
GameObject.Find("MyButton").GetComponentInChildren<Text>().text = "Stop";
int pos = Microphone.GetPosition(Microphone.devices[0]);
int diff = pos - lastSample;
if (diff > 0)
{
float[] samples = new float[diff * audioSource.clip.channels];
audioSource.clip.GetData(samples, lastSample);
byte[] ba = ConvertAudioClipDataToInt16ByteArray(samples);
if (ba.Length != 0)
{
Debug.Log("pushStream.Write pos:" + Microphone.GetPosition(Microphone.devices[0]).ToString() + " length: " + ba.Length.ToString());
pushStream.Write(ba);
}
}
lastSample = pos;
}
else if (!Microphone.IsRecording(Microphone.devices[0]) && recognitionStarted == false)
{
GameObject.Find("MyButton").GetComponentInChildren<Text>().text = "Start";
}
}
}
3.2. PC Build
- After building, copy Microsoft.CognitiveServices.Speech.core.dll from the <ProjectName>_Data/Plugins folder into the <ProjectName>_Data/Managed folder (a post-build script that automates this is sketched below).
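* A sketch of a Unity post-build script (placed under an Editor folder) that automates the copy above, assuming the default Windows standalone build layout; the class name is illustrative and the exact Plugins subfolder should be verified against your build output:
#if UNITY_EDITOR
using System.IO;
using UnityEditor;
using UnityEditor.Callbacks;
using UnityEngine;
// Sketch: copies the Speech SDK core DLL from Plugins to Managed after a Windows build.
public static class SpeechSdkPostBuild
{
    [PostProcessBuild]
    public static void OnPostProcessBuild(BuildTarget target, string pathToBuiltProject)
    {
        if (target != BuildTarget.StandaloneWindows && target != BuildTarget.StandaloneWindows64)
            return;
        // <BuildDir>/<ProjectName>_Data, derived from the built .exe path.
        string dataDir = Path.Combine(
            Path.GetDirectoryName(pathToBuiltProject),
            Path.GetFileNameWithoutExtension(pathToBuiltProject) + "_Data");
        string src = Path.Combine(dataDir, "Plugins", "Microsoft.CognitiveServices.Speech.core.dll");
        string dst = Path.Combine(dataDir, "Managed", "Microsoft.CognitiveServices.Speech.core.dll");
        if (File.Exists(src))
            File.Copy(src, dst, true);
        else
            Debug.LogWarning("Speech core DLL not found at: " + src);
    }
}
#endif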
3.3. Android Build
- Follows the usual Android build settings; the microphone permission is required.
- The Scripting Backend does not have to be IL2CPP; Mono works as well.
- Runtime Version: .NET 4.x
3.4. Development Notes (REST API)
- Send the recorded audio file to the service.
- Pronunciation accuracy and related metrics can be returned: https://docs.microsoft.com/ko-kr/azure/cognitive-services/speech-service/rest-speech-to-text#pronunciation-assessment-parameters (see the response parameters)
- The longer the recording, the longer the request takes.
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Text;
using UnityEngine;
using UnityEngine.Networking;
using UnityEngine.UI;
public class CertiClass : CertificateHandler
{
protected override bool ValidateCertificate(byte[] certificateData)
{
return true;
}
}
public class TestScript : MonoBehaviour
{
private string m_authKey;
private SavWav m_saveWav;
public AudioSource m_audioSource;
private void OnEnable()
{
m_saveWav = new SavWav();
}
public void OnClick()
{
StartCoroutine(CoConnectServer(""));
}
public void MicStart()
{
if (Microphone.IsRecording(Microphone.devices[0]))
{
Debug.Log("Microphone.End: " + Microphone.devices[0]);
Microphone.End(null);
}
else
{
m_audioSource.clip = Microphone.Start(Microphone.devices[0], true, 200, 16000);
}
}
public void MicStop()
{
Microphone.End(null);
if (File.Exists(Path.Combine(Application.streamingAssetsPath, "temp.wav")))
File.Delete(Path.Combine(Application.streamingAssetsPath, "temp.wav"));
m_saveWav.Save(Path.Combine(Application.streamingAssetsPath, "temp.wav"), m_audioSource.clip);
StartCoroutine(CoConnectServer(""));
}
protected IEnumerator CoConnectServer(string text)
{
UnityWebRequest request = null;
try
{
string path = Path.Combine(Application.streamingAssetsPath, "temp.wav");
List<byte> temp = new List<byte>();
using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read))
{
// Read the entire file into a byte buffer at once.
byte[] buffer = new Byte[fs.Length];
fs.Read(buffer, 0, buffer.Length);
temp.AddRange(buffer);
fs.Close();
}
string url = string.Format("https://westus.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1?{0}", "language=en-US&format=detailed");
request = UnityWebRequest.Put(url, temp.ToArray());
request.SetRequestHeader("Content-Type", "audio/wav; codecs=audio/pcm; samplerate=16000");
request.SetRequestHeader("Ocp-Apim-Subscription-Key", "49b020847c33407197061fee21f33df8");
request.SetRequestHeader("Accept", "application/json;text/xml");
var pronAssessmentParamsJson = $"____preserved_1____";
var pronAssessmentParamsBytes = Encoding.UTF8.GetBytes(pronAssessmentParamsJson);
var pronAssessmentHeader = Convert.ToBase64String(pronAssessmentParamsBytes);
request.SetRequestHeader("Pronunciation-Assessment", pronAssessmentHeader);
request.chunkedTransfer = true;
//request.SetRequestHeader("X-NCP-APIGW-API-KEY-ID", MintDefaultOptionsEditorData.Instance.NaverTTSClientID);
//request.SetRequestHeader("X-NCP-APIGW-API-KEY", MintDefaultOptionsEditorData.Instance.NaverTTSClientSecretKey);
request.timeout = 100;
request.method = "POST";
request.certificateHandler = new CertiClass();
}
catch (Exception ex)
{
Debug.Log(ex.Message);
throw;
}
yield return request.SendWebRequest();
if (string.IsNullOrEmpty(request.error))
{
Debug.Log(request.downloadHandler.text);
}
else
{
Debug.Log(request.error);
}
}
}
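The pronunciation-assessment parameter JSON above is preserved as a placeholder; per the linked docs, the header value is a Base64-encoded UTF-8 JSON object. A sketch of building it (the ReferenceText value and helper name are illustrative):
using System;
using System.Text;
// Sketch: builds the Pronunciation-Assessment header value per the linked REST docs.
public static class PronunciationAssessmentHeader
{
    public static string Build(string referenceText)
    {
        // GradingSystem: FivePoint or HundredMark; Granularity: Phoneme, Word, or FullText;
        // Dimension: Basic or Comprehensive (see the pronunciation-assessment-parameters docs).
        string json = "{\"ReferenceText\":\"" + referenceText + "\","
            + "\"GradingSystem\":\"HundredMark\","
            + "\"Granularity\":\"Phoneme\","
            + "\"Dimension\":\"Comprehensive\"}";
        return Convert.ToBase64String(Encoding.UTF8.GetBytes(json));
    }
}
// Usage inside CoConnectServer:
// request.SetRequestHeader("Pronunciation-Assessment", PronunciationAssessmentHeader.Build("Good morning."));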
------------- SavWav: a helper module that saves the microphone recording as a WAV file ---------------
using System;
using System.IO;
using UnityEngine;
using System.Collections.Generic;
using System.Threading;
public class SavWav
{
const int HEADER_SIZE = 44;
struct ClipData
{
public int samples;
public int channels;
public float[] samplesData;
}
public bool Save(string filename, AudioClip clip)
{
if (!filename.ToLower().EndsWith(".wav"))
{
filename += ".wav";
}
var filepath = filename;
Debug.Log(filepath);
clip = TrimSilence(clip, 0);
// Make sure directory exists if user is saving to sub dir.
Directory.CreateDirectory(Path.GetDirectoryName(filepath));
ClipData clipdata = new ClipData();
clipdata.samples = clip.samples;
clipdata.channels = clip.channels;
float[] dataFloat = new float[clip.samples * clip.channels];
clip.GetData(dataFloat, 0);
clipdata.samplesData = dataFloat;
using (var fileStream = CreateEmpty(filepath))
{
MemoryStream memstrm = new MemoryStream();
ConvertAndWrite(memstrm, clipdata);
memstrm.WriteTo(fileStream);
WriteHeader(fileStream, clip);
}
return true; // TODO: return false if there's a failure saving the file
}
public AudioClip TrimSilence(AudioClip clip, float min)
{
var samples = new float[clip.samples];
clip.GetData(samples, 0);
return TrimSilence(new List<float>(samples), min, clip.channels, clip.frequency);
}
public AudioClip TrimSilence(List<float> samples, float min, int channels, int hz)
{
return TrimSilence(samples, min, channels, hz, false, false);
}
public AudioClip TrimSilence(List<float> samples, float min, int channels, int hz, bool _3D, bool stream)
{
int i;
for (i = 0; i < samples.Count; i++)
{
if (Mathf.Abs(samples[i]) > min)
{
break;
}
}
samples.RemoveRange(0, i);
for (i = samples.Count - 1; i > 0; i--)
{
if (Mathf.Abs(samples[i]) > min)
{
break;
}
}
samples.RemoveRange(i, samples.Count - i);
var clip = AudioClip.Create("TempClip", samples.Count, channels, hz, _3D, stream);
clip.SetData(samples.ToArray(), 0);
return clip;
}
FileStream CreateEmpty(string filepath)
{
var fileStream = new FileStream(filepath, FileMode.Create);
byte emptyByte = new byte();
for (int i = 0; i < HEADER_SIZE; i++) //preparing the header
{
fileStream.WriteByte(emptyByte);
}
return fileStream;
}
void ConvertAndWrite(MemoryStream memStream, ClipData clipData)
{
float[] samples = new float[clipData.samples * clipData.channels];
samples = clipData.samplesData;
Int16[] intData = new Int16[samples.Length];
Byte[] bytesData = new Byte[samples.Length * 2];
const float rescaleFactor = 32767; //to convert float to Int16
for (int i = 0; i < samples.Length; i++)
{
intData[i] = (short)(samples[i] * rescaleFactor);
//Debug.Log (samples [i]);
}
Buffer.BlockCopy(intData, 0, bytesData, 0, bytesData.Length);
memStream.Write(bytesData, 0, bytesData.Length);
}
void WriteHeader(FileStream fileStream, AudioClip clip)
{
var hz = clip.frequency;
var channels = clip.channels;
var samples = clip.samples;
fileStream.Seek(0, SeekOrigin.Begin);
Byte[] riff = System.Text.Encoding.UTF8.GetBytes("RIFF");
fileStream.Write(riff, 0, 4);
Byte[] chunkSize = BitConverter.GetBytes(fileStream.Length - 8);
fileStream.Write(chunkSize, 0, 4);
Byte[] wave = System.Text.Encoding.UTF8.GetBytes("WAVE");
fileStream.Write(wave, 0, 4);
Byte[] fmt = System.Text.Encoding.UTF8.GetBytes("fmt ");
fileStream.Write(fmt, 0, 4);
Byte[] subChunk1 = BitConverter.GetBytes(16);
fileStream.Write(subChunk1, 0, 4);
UInt16 one = 1;
Byte[] audioFormat = BitConverter.GetBytes(one);
fileStream.Write(audioFormat, 0, 2);
Byte[] numChannels = BitConverter.GetBytes(channels);
fileStream.Write(numChannels, 0, 2);
Byte[] sampleRate = BitConverter.GetBytes(hz);
fileStream.Write(sampleRate, 0, 4);
Byte[] byteRate = BitConverter.GetBytes(hz * channels * 2); // byteRate = sampleRate * bytesPerSample * numChannels
fileStream.Write(byteRate, 0, 4);
UInt16 blockAlign = (ushort)(channels * 2);
fileStream.Write(BitConverter.GetBytes(blockAlign), 0, 2);
UInt16 bps = 16;
Byte[] bitsPerSample = BitConverter.GetBytes(bps);
fileStream.Write(bitsPerSample, 0, 2);
Byte[] datastring = System.Text.Encoding.UTF8.GetBytes("data");
fileStream.Write(datastring, 0, 4);
Byte[] subChunk2 = BitConverter.GetBytes(samples * 2);
fileStream.Write(subChunk2, 0, 4);
fileStream.Close();
}
}
3.5. Response Message Structure
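- In detailed format, the REST API returns JSON with an NBest list containing the recognition text and, when the Pronunciation-Assessment header is sent, scoring fields. A sketch of matching C# classes for deserialization (field names follow the linked REST docs; verify against a real response):
using System;
// Sketch of the detailed-format response shape, usable with JsonUtility.FromJson<SttResponse>(json).
[Serializable]
public class SttResponse
{
    public string RecognitionStatus; // e.g. "Success"
    public long Offset;              // start offset, in 100-nanosecond units
    public long Duration;            // audio duration, in 100-nanosecond units
    public NBestEntry[] NBest;
}
[Serializable]
public class NBestEntry
{
    public double Confidence;
    public string Lexical;    // raw lexical form
    public string ITN;        // inverse text normalization applied
    public string MaskedITN;  // ITN with profanity masking
    public string Display;    // display form with punctuation
    // Present when the Pronunciation-Assessment header was sent:
    public double AccuracyScore;
    public double FluencyScore;
    public double CompletenessScore;
    public double PronScore;
}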
4. Asset
- https://assetstore.unity.com/packages/add-ons/machinelearning/google-cloud-speech-recognition-vr-ar-mobile-desktop-pro-72625
- Appears to support Android, iOS, and Windows, and presumably uses REST (though it does not actually seem slow..?)
- Uses Google STT; streaming is not currently supported.
* 1. Purchase the asset and test it.
2. Implement Unity code that can integrate with Jason's Google server.