介绍
本教程将指导您使用Hugging Face Unity API在Unity游戏中实现最先进的语音识别。该功能可用于给予指令、与NPC进行对话、提高可访问性或其他需要将口语转换为文本的功能。
要在Unity中尝试语音识别,请查看itch.io上的实时演示。
先决条件
本教程假设您具备Unity的基本知识。它还要求您已安装Hugging Face Unity API。有关设置API的说明,请查看我们之前的博客文章。
步骤
1. 设置场景
在本教程中,我们将设置一个非常简单的场景,玩家可以在其中开始和停止录制,然后将结果转换为文本。
首先创建一个Unity项目,然后创建一个带有三个UI元素的画布:
- 开始按钮:用于开始录制。
- 停止按钮:用于停止录制。
- 文本(TextMeshPro):用于显示语音识别的结果。
2. 设置脚本
创建一个名为SpeechRecognitionTest
的脚本,并将其附加到一个空的游戏对象上。
在脚本中,定义对UI组件的引用:
[SerializeField] private Button startButton;
[SerializeField] private Button stopButton;
[SerializeField] private TextMeshProUGUI text;
在检查器中分配它们。
然后,使用Start()
方法为开始和停止按钮设置侦听器:
// Register click handlers for the start/stop buttons.
private void Start() {
startButton.onClick.AddListener(StartRecording);
stopButton.onClick.AddListener(StopRecording);
}
此时,您的脚本应该类似于以下内容:
using TMPro;
using UnityEngine;
using UnityEngine.UI;
// Minimal first version of the script: wires the UI buttons to
// empty recording callbacks (implemented in later steps).
public class SpeechRecognitionTest : MonoBehaviour {
[SerializeField] private Button startButton;   // UI button that begins recording
[SerializeField] private Button stopButton;    // UI button that ends recording
[SerializeField] private TextMeshProUGUI text; // displays the recognition result
private void Start() {
// Register click handlers for the start/stop buttons.
startButton.onClick.AddListener(StartRecording);
stopButton.onClick.AddListener(StopRecording);
}
private void StartRecording() {
}
private void StopRecording() {
}
}
3. 录制麦克风输入
现在让我们录制麦克风输入并将其编码为WAV格式。首先定义成员变量:
private AudioClip clip;
private byte[] bytes;
private bool recording;
然后,在StartRecording()
中使用Microphone.Start()
方法开始录制:
// Begin capturing audio from the default microphone.
private void StartRecording() {
// null = default mic device; no looping; up to 10 s at 44.1 kHz.
clip = Microphone.Start(null, false, 10, 44100);
recording = true;
}
这将以44100 Hz的采样率录制最多10秒的音频。
如果录制达到了最大长度10秒,我们将希望自动停止录制。为此,在Update()
方法中编写以下代码:
// Auto-stop once the clip's full capacity (10 s) has been filled.
private void Update() {
if (recording && Microphone.GetPosition(null) >= clip.samples) {
StopRecording();
}
}
然后,在StopRecording()
中截断录音并将其编码为WAV格式:
// Stop the microphone, trim the clip to what was actually recorded,
// and encode the samples as a WAV byte array.
private void StopRecording() {
var position = Microphone.GetPosition(null); // read position BEFORE End(), which resets it
Microphone.End(null);
var samples = new float[position * clip.channels];
clip.GetData(samples, 0);
bytes = EncodeAsWAV(samples, clip.frequency, clip.channels);
recording = false;
}
最后,我们需要实现EncodeAsWAV()
方法,以准备音频数据供Hugging Face API使用:
// Serialize raw float samples into a 16-bit PCM WAV byte array:
// a 44-byte RIFF header followed by little-endian sample data.
private byte[] EncodeAsWAV(float[] samples, int frequency, int channels) {
using (var memoryStream = new MemoryStream(44 + samples.Length * 2)) {
using (var writer = new BinaryWriter(memoryStream)) {
writer.Write("RIFF".ToCharArray());
writer.Write(36 + samples.Length * 2); // RIFF chunk size: total file length minus 8
writer.Write("WAVE".ToCharArray());
writer.Write("fmt ".ToCharArray());
writer.Write(16); // fmt sub-chunk size for PCM
writer.Write((ushort)1); // audio format: 1 = uncompressed PCM
writer.Write((ushort)channels);
writer.Write(frequency); // sample rate
writer.Write(frequency * channels * 2); // byte rate (2 bytes per sample)
writer.Write((ushort)(channels * 2)); // block align: bytes per sample frame
writer.Write((ushort)16); // bits per sample
writer.Write("data".ToCharArray());
writer.Write(samples.Length * 2); // data sub-chunk size in bytes
foreach (var sample in samples) {
// Scale [-1, 1] floats to 16-bit range.
// NOTE(review): values outside [-1, 1] would overflow the short cast;
// mic input is presumably already normalized — confirm if the source changes.
writer.Write((short)(sample * short.MaxValue));
}
}
// MemoryStream.ToArray remains valid even after the writer disposed the stream.
return memoryStream.ToArray();
}
}
完整的脚本现在应该看起来像这样:
using System.IO;
using TMPro;
using UnityEngine;
using UnityEngine.UI;
// Second version of the script: records mic input on demand and
// encodes the captured samples as a 16-bit PCM WAV byte array.
public class SpeechRecognitionTest : MonoBehaviour {
[SerializeField] private Button startButton;   // UI button that begins recording
[SerializeField] private Button stopButton;    // UI button that ends recording
[SerializeField] private TextMeshProUGUI text; // displays the recognition result
private AudioClip clip; // clip currently being recorded
private byte[] bytes;   // WAV-encoded result of the last recording
private bool recording; // true while the microphone is active
private void Start() {
// Register click handlers for the start/stop buttons.
startButton.onClick.AddListener(StartRecording);
stopButton.onClick.AddListener(StopRecording);
}
private void Update() {
// Auto-stop once the clip's full capacity (10 s) has been filled.
if (recording && Microphone.GetPosition(null) >= clip.samples) {
StopRecording();
}
}
// Begin capturing audio from the default microphone
// (no looping, up to 10 s at 44.1 kHz).
private void StartRecording() {
clip = Microphone.Start(null, false, 10, 44100);
recording = true;
}
// Stop the microphone, trim the clip to what was actually recorded,
// and encode the samples as WAV.
private void StopRecording() {
var position = Microphone.GetPosition(null); // read position BEFORE End(), which resets it
Microphone.End(null);
var samples = new float[position * clip.channels];
clip.GetData(samples, 0);
bytes = EncodeAsWAV(samples, clip.frequency, clip.channels);
recording = false;
}
// Serialize raw float samples into a 16-bit PCM WAV byte array:
// a 44-byte RIFF header followed by little-endian sample data.
private byte[] EncodeAsWAV(float[] samples, int frequency, int channels) {
using (var memoryStream = new MemoryStream(44 + samples.Length * 2)) {
using (var writer = new BinaryWriter(memoryStream)) {
writer.Write("RIFF".ToCharArray());
writer.Write(36 + samples.Length * 2); // RIFF chunk size: total file length minus 8
writer.Write("WAVE".ToCharArray());
writer.Write("fmt ".ToCharArray());
writer.Write(16); // fmt sub-chunk size for PCM
writer.Write((ushort)1); // audio format: 1 = uncompressed PCM
writer.Write((ushort)channels);
writer.Write(frequency); // sample rate
writer.Write(frequency * channels * 2); // byte rate (2 bytes per sample)
writer.Write((ushort)(channels * 2)); // block align: bytes per sample frame
writer.Write((ushort)16); // bits per sample
writer.Write("data".ToCharArray());
writer.Write(samples.Length * 2); // data sub-chunk size in bytes
foreach (var sample in samples) {
// Scale [-1, 1] floats to 16-bit range.
writer.Write((short)(sample * short.MaxValue));
}
}
// MemoryStream.ToArray remains valid even after the writer disposed the stream.
return memoryStream.ToArray();
}
}
}
要测试这段代码是否正常工作,您可以在StopRecording()
方法的末尾添加以下行:
File.WriteAllBytes(Application.dataPath + "/test.wav", bytes);
现在,如果您点击Start
按钮,对着麦克风说话,然后点击Stop
,一个test.wav
文件应该保存在您的Unity Assets文件夹中,其中包含您录制的音频。
4. 语音识别
接下来,我们将使用Hugging Face Unity API对我们编码的音频进行语音识别。为此,我们将创建一个SendRecording()
方法:
using HuggingFace.API;
// Send the WAV-encoded recording to the Hugging Face API and show
// the transcription in white, or the error message in red.
private void SendRecording() {
HuggingFaceAPI.AutomaticSpeechRecognition(bytes, response => {
text.color = Color.white;
text.text = response;
}, error => {
text.color = Color.red;
text.text = error;
});
}
这将向API发送编码的音频,如果成功,将以白色显示响应,否则将以红色显示错误消息。
不要忘记在StopRecording()
方法的末尾调用SendRecording()
:
// After stopping and encoding the clip, immediately send it for recognition.
private void StopRecording() {
/* other code */
SendRecording();
}
5. 最后的修饰
最后,让我们稍微改进一下这个演示的用户体验,包括按钮的可交互性和状态消息。
在适当时,即准备好开始/停止录音时,开始和停止按钮应该可交互。
然后,在录音或等待API时,将响应文本设置为简单的状态消息。
完成的脚本应该看起来像这样:
using System.IO;
using HuggingFace.API;
using TMPro;
using UnityEngine;
using UnityEngine.UI;
// Final version: records mic input, encodes it as WAV, sends it to the
// Hugging Face speech-recognition API, and manages button state / status text.
public class SpeechRecognitionTest : MonoBehaviour {
[SerializeField] private Button startButton;   // UI button that begins recording
[SerializeField] private Button stopButton;    // UI button that ends recording
[SerializeField] private TextMeshProUGUI text; // status message / recognition result
private AudioClip clip; // clip currently being recorded
private byte[] bytes;   // WAV-encoded result of the last recording
private bool recording; // true while the microphone is active
private void Start() {
// Register click handlers; only "start" is usable until recording begins.
startButton.onClick.AddListener(StartRecording);
stopButton.onClick.AddListener(StopRecording);
stopButton.interactable = false;
}
private void Update() {
// Auto-stop once the clip's full capacity (10 s) has been filled.
if (recording && Microphone.GetPosition(null) >= clip.samples) {
StopRecording();
}
}
// Begin capturing audio from the default microphone
// (no looping, up to 10 s at 44.1 kHz) and update the UI state.
private void StartRecording() {
text.color = Color.white;
text.text = "正在录音...";
startButton.interactable = false;
stopButton.interactable = true;
clip = Microphone.Start(null, false, 10, 44100);
recording = true;
}
// Stop the microphone, trim the clip to what was actually recorded,
// encode it as WAV, and send it off for recognition.
private void StopRecording() {
var position = Microphone.GetPosition(null); // read position BEFORE End(), which resets it
Microphone.End(null);
var samples = new float[position * clip.channels];
clip.GetData(samples, 0);
bytes = EncodeAsWAV(samples, clip.frequency, clip.channels);
recording = false;
SendRecording();
}
// Send the WAV-encoded recording to the Hugging Face API; show the
// transcription in white, or the error in red, then re-enable "start".
private void SendRecording() {
text.color = Color.yellow;
text.text = "正在发送...";
stopButton.interactable = false;
HuggingFaceAPI.AutomaticSpeechRecognition(bytes, response => {
text.color = Color.white;
text.text = response;
startButton.interactable = true;
}, error => {
text.color = Color.red;
text.text = error;
startButton.interactable = true;
});
}
// Serialize raw float samples into a 16-bit PCM WAV byte array:
// a 44-byte RIFF header followed by little-endian sample data.
private byte[] EncodeAsWAV(float[] samples, int frequency, int channels) {
using (var memoryStream = new MemoryStream(44 + samples.Length * 2)) {
using (var writer = new BinaryWriter(memoryStream)) {
writer.Write("RIFF".ToCharArray());
writer.Write(36 + samples.Length * 2); // RIFF chunk size: total file length minus 8
writer.Write("WAVE".ToCharArray());
writer.Write("fmt ".ToCharArray());
writer.Write(16); // fmt sub-chunk size for PCM
writer.Write((ushort)1); // audio format: 1 = uncompressed PCM
writer.Write((ushort)channels);
writer.Write(frequency); // sample rate
writer.Write(frequency * channels * 2); // byte rate (2 bytes per sample)
writer.Write((ushort)(channels * 2)); // block align: bytes per sample frame
writer.Write((ushort)16); // bits per sample
writer.Write("data".ToCharArray());
writer.Write(samples.Length * 2); // data sub-chunk size in bytes
foreach (var sample in samples) {
// Scale [-1, 1] floats to 16-bit range.
writer.Write((short)(sample * short.MaxValue));
}
}
// MemoryStream.ToArray remains valid even after the writer disposed the stream.
return memoryStream.ToArray();
}
}
}
恭喜你,现在你可以在Unity中使用最先进的语音识别技术了!
如果你有任何问题或想更深入地了解如何在游戏中使用Hugging Face,请加入Hugging Face的Discord!