// Firstborn/Assets/Scripts/SpeechBlend.cs

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
#if UNITY_EDITOR
using UnityEditor;
#endif
using SpeechBlendEngine;
using TMPro;
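/// <summary>
/// Audio-driven lipsync: each physics step the attached AudioSource is sampled,
/// the signal volume drives a jaw/mouth-open setpoint, and (in jaw-and-visemes mode)
/// the frequency spectrum is run through the SpeechBlendEngine feature extractor to
/// produce per-viseme blendshape setpoints, which are blended onto the face meshes in LateUpdate.
/// </summary>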
public class SpeechBlend : MonoBehaviour
{
    public AudioSource voiceAudioSource;
    public List<SkinnedMeshRenderer> FaceMeshes;
    [HideInInspector]
    public bool showBlendShapeMenu = false;
    [HideInInspector]
    public SpeechUtil.VisemeBlendshapeIndexes faceBlendshapes;
    [HideInInspector]
    public SpeechUtil.VisemeWeight visemeWeightTuning;

    [Header("Settings")]
    [Space(10)]
    [Tooltip("Toggle lipsyncing")]
    public bool lipsyncActive = true; // Toggle lipsyncing
    [Tooltip("Select whether visemes are used")]
    public SpeechUtil.Mode trackingMode = SpeechUtil.Mode.jawOnly; // Select whether visemes are used
#if UNITY_WEBGL
    public float WebGLVolumeAdjust = 200;
#endif
    [Tooltip("Amplitude of jaw movement")]
    [Range(0.0f, 1.0f)]
    public float jawMovementAmount = 0.5f; // Amplitude of jaw movement
    [Tooltip("Jaw motion speed")]
    [Range(0.0f, 1.0f)]
    public float jawMovementSpeed = 0.5f; // Jaw motion speed
    [Tooltip("Amplitude of lip movement")]
    [Range(0.0f, 1.0f)]
    public float lipsBlendshapeMovementAmount = 0.5f; // Amplitude of lip movement
    [Tooltip("Lip viseme movement speed")]
    [Range(0.0f, 1.0f)]
    public float lipsBlendshapeChangeSpeed = 0.5f; // Lip viseme movement speed
[Tooltip("Number of calculations to use.")]
public SpeechUtil.Accuracy accuracy = SpeechUtil.Accuracy.Medium; // Number of calculations to use. Cannot be changed at runtime
[Tooltip("Number of frames to wait before next calculation (higher number uses less resources but reponds slower)")]
[Range(1, 10)]
public int framesPerUpdate = 1; // Number of frames to wait before next calculation (higher number uses less resources but reponds slower)
[Tooltip("Ignore distance between AudioSource and AudioListener when accounting for volume.")]
public bool volumeEqualization = false;
[Tooltip("Voice type of character")]
public VoiceProfile.VoiceType voiceType = VoiceProfile.VoiceType.female; // Voice type of character
[Tooltip("Jaw joint for when not using a mouth open blendshape")]
public Transform jawJoint; // Jaw joint for when not using a mouth open blendshape
[Tooltip("Direction adjust for jaw opening")]
public Vector3 jawOpenDirection = new Vector3(1, 0, 0); // Direction adjust for jaw joint opening
[Tooltip("Angular offset for jaw joint opening")]
public Vector3 jawJointOffset; // Angular offset for jaw joint opening
[Tooltip("Blendshape template for visemes shapes. (default: DAZ)")]
public VoiceProfile.VisemeBlendshapeTemplate shapeTemplate; // Blendshape template for visemes shapes. (default: DAZ)
[HideInInspector]
public AudioListener activeListener; // Source audiolistener for use when calculating the volume equalization
    float bs_volume_scaling = 20f;
    float jaw_volume_scaling = 20f;
    int f_low;
    int f_high;
    float fres;
    float[,] extractor; // Speech extractor model
    float[,] transformer; // Speech data transformer
    float[] modifier; // Speech data modifier
    float[] bs_setpoint;
    float[] bs_setpoint_last;
    float[,] cmem;
    float bs_mouthOpen_setpoint;
    Quaternion trans_mouthOpen_setpoint;
    Quaternion trans_mouthOpen_rest;
    float current_volume;
    [HideInInspector]
    public VoiceProfile.VisemeBlendshapeTemplate template_saved = VoiceProfile.VisemeBlendshapeTemplate.DAZ;
    [HideInInspector]
    public SpeechUtil.VisemeBlendshapeIndexes faceBlendshapes_saved = new SpeechUtil.VisemeBlendshapeIndexes(VoiceProfile.G2_template);
    [HideInInspector]
    public SpeechUtil.VisemeWeight visemeWeightTuning_saved = new SpeechUtil.VisemeWeight(VoiceProfile.G2_template);
    float jaw_CSF = 1;
    float bs_CSF = 1;
    int updateFrame = 0;
    SpeechUtil.Accuracy accuracy_last;
    bool[] blendshapeInfluenceActive;
    void Start() {
        // Runtime state is initialized in Talk() rather than here.
    }
    public void IsMale(bool TF) {
        if (TF) {
            voiceType = VoiceProfile.VoiceType.male;
        } else {
            voiceType = VoiceProfile.VoiceType.female;
        }
    }
    public void Talk(AudioClip A) {
        Debug.Log("Talking");
        voiceAudioSource.clip = A;
        voiceAudioSource.Play();
        bs_setpoint = new float[faceBlendshapes.template.Nvis];
        bs_setpoint_last = new float[faceBlendshapes.template.Nvis];
        if (jawJoint != null)
        {
            trans_mouthOpen_setpoint = jawJoint.localRotation;
        }
        trans_mouthOpen_rest = trans_mouthOpen_setpoint;
        bs_mouthOpen_setpoint = 0;
        accuracy_last = accuracy;
        fres = ExtractFeatures.CalculateFres();
        UpdateExtractor();
        if (jawJoint == null && !faceBlendshapes.AnyAssigned())
        {
            Debug.LogWarning("SpeechBlend: Neither a jaw joint nor face blendshapes have been assigned; lipsync disabled.");
            lipsyncActive = false;
        }
        if (trackingMode.Equals(SpeechUtil.Mode.jawAndVisemes) && faceBlendshapes.JawOnly())
        {
            Debug.LogWarning("SpeechBlend: No viseme blendshapes detected, jaw-only mode enabled.");
            trackingMode = SpeechUtil.Mode.jawOnly;
        }
        else if (trackingMode.Equals(SpeechUtil.Mode.jawAndVisemes))
        {
            blendshapeInfluenceActive = new bool[faceBlendshapes.template.Nvis];
        }
    }
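    // Illustrative usage sketch (not part of this component): a hypothetical caller that
    // holds a reference to this SpeechBlend and an AudioClip might drive it like this.
    // "speechBlend" and "dialogueClip" are assumed names used only for the example.
    //
    //     speechBlend.IsMale(true);                                   // select the male voice profile
    //     speechBlend.trackingMode = SpeechUtil.Mode.jawAndVisemes;   // jaw joint plus viseme blendshapes
    //     speechBlend.Talk(dialogueClip);                             // play the clip and start lipsync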
    int printIndex = 200;
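    // Per-physics-step processing (throttled by framesPerUpdate): sample the playing AudioSource,
    // smooth the volume, derive the jaw-open setpoint, and in jaw-and-visemes mode extract spectral
    // features to compute per-viseme blendshape setpoints.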
    void FixedUpdate()
    {
        if (voiceAudioSource.isPlaying && lipsyncActive)
        {
            // Map the 0..1 movement amounts to volume divisors (500 at 0, ~1.1 at 1),
            // so a larger movement amount produces larger motion for the same volume.
            bs_volume_scaling = 500f * Mathf.Exp(-6.111f * lipsBlendshapeMovementAmount);
            jaw_volume_scaling = 500f * Mathf.Exp(-6.111f * jawMovementAmount);
            if (++updateFrame >= framesPerUpdate)
            {
                updateFrame = 0;
                float last_volume = current_volume;
                current_volume = 0;
#if UNITY_WEBGL
                int no_samples = SpeechBlend_WEBGL_AudioSpectrum.InitializeBytes(4096); // Get volume data
                current_volume = SpeechBlend_WEBGL_AudioSpectrum.GetSpeechVolume();
                current_volume = current_volume * WebGLVolumeAdjust;
#else
                float[] audioTrace = new float[256];
                voiceAudioSource.GetOutputData(audioTrace, 0);
                for (int i = 0; i < 256; i++)
                    current_volume += Mathf.Abs(audioTrace[i]);
#endif
                if (volumeEqualization)
                    current_volume = ExtractFeatures.EqualizeDistance(current_volume, voiceAudioSource, activeListener);
                // Exponentially smooth the volume so the mouth does not flutter frame to frame.
                current_volume = last_volume * (1 - lipsBlendshapeChangeSpeed) + current_volume * lipsBlendshapeChangeSpeed;
                // Calculate jaw open amount
                bs_mouthOpen_setpoint = 100 * current_volume / jaw_volume_scaling * .1f * (1 / jaw_CSF);
                if (jawJoint != null)
                {
                    // Rotate the jaw from its rest pose toward jawOpenDirection in proportion to volume.
                    trans_mouthOpen_setpoint = Quaternion.Euler(jawJointOffset + trans_mouthOpen_rest.eulerAngles * (1 - jawMovementAmount * 3) + (trans_mouthOpen_rest.eulerAngles + jawOpenDirection * current_volume / jaw_volume_scaling) * jawMovementAmount * 3);
                }
                // Calculate viseme amounts
                if (trackingMode == SpeechUtil.Mode.jawAndVisemes)
                {
                    f_low = Mathf.RoundToInt(ExtractFeatures.getlf(accuracy) / fres);
                    f_high = Mathf.RoundToInt(ExtractFeatures.gethf(accuracy) / fres);
                    if (accuracy_last != accuracy)
                        UpdateExtractor();
                    accuracy_last = accuracy;
#if UNITY_WEBGL
                    float[] rawData = new float[4096];
                    SpeechBlend_WEBGL_AudioSpectrum.GetFrequencySpectrum(rawData); // Get the spectrum data
                    for (int i = 0; i < rawData.Length; i++)
                        rawData[i] += 1e-10f;
#else
                    float[] rawData = ExtractFeatures.GetSoundData(voiceAudioSource);
#endif
                    // Extract features from the spectrum and evaluate them against the voice profile.
                    float[] features = ExtractFeatures.ExtractSample(rawData, extractor, transformer, modifier, ref cmem, f_low, f_high, accuracy);
                    ExtractFeatures.FeatureOutput W = ExtractFeatures.Evaluate(features, voiceType, accuracy);
                    // Accumulate viseme influences across the feature outputs, weighted by W.w.
                    float[] influences = new float[ExtractFeatures.no_visemes];
                    for (int i = 0; i < W.size; i++)
                    {
                        for (int j = 0; j < ExtractFeatures.no_visemes; j++)
                            influences[j] += VoiceProfile.Influence(voiceType, W.reg[i], j, accuracy) * W.w[i];
                    }
                    float[] influences_template = VoiceProfile.InfluenceTemplateTransform(influences, shapeTemplate);
                    // Apply per-viseme tuning weights and mark visemes with negligible weight as inactive.
                    blendshapeInfluenceActive = new bool[faceBlendshapes.template.Nvis];
                    for (int i = 0; i < faceBlendshapes.template.Nvis; i++)
                    {
                        float visemeWeight = visemeWeightTuning.GetByIndex(i);
                        influences_template[i] *= visemeWeight;
                        blendshapeInfluenceActive[i] = visemeWeight >= 1e-2f;
                    }
                    for (int i = 0; i < faceBlendshapes.template.Nvis; i++)
                    {
                        bs_setpoint[i] = influences_template[i] * 100 * current_volume / bs_volume_scaling;
                    }
                    // Common scale factor (CSF) from the first feature output, shared by jaw and blendshape smoothing.
                    float csf = VoiceProfile.Influence(voiceType, W.reg[0], ExtractFeatures.no_visemes, accuracy);
                    jaw_CSF = csf;
                    bs_CSF = csf;
                    bs_mouthOpen_setpoint /= csf;
                }
            }
        }
    }
    private void LateUpdate()
    {
        if (voiceAudioSource.clip != null) {
            // Update jaw joint animation (only when no mouthOpen blendshape is assigned)
            if (!faceBlendshapes.BlendshapeAssigned("mouthOpen"))
            {
                if (jawJoint != null)
                {
                    // jawMovementSpeed (0..1) maps exponentially to a lerp rate of roughly 2.5..97.
                    float moveSpeed = 2.5f * Mathf.Exp(3.658f * jawMovementSpeed);
                    jawJoint.localRotation = Quaternion.Lerp(jawJoint.localRotation, trans_mouthOpen_setpoint, Time.deltaTime * moveSpeed);
                }
            }
            // Update facial blendshapes
            UpdateBlendshapes();
        }
    }
    void UpdateBlendshapes()
    {
        // Setpoint arrays are allocated in Talk(); skip the viseme pass if they are not ready yet
        // (e.g. the tracking mode was switched at runtime before the next FixedUpdate).
        if (trackingMode == SpeechUtil.Mode.jawAndVisemes && bs_setpoint != null && blendshapeInfluenceActive != null)
        {
            for (int i = 0; i < faceBlendshapes.template.Nvis; i++)
            {
                if (faceBlendshapes.BlendshapeAssigned(i) && blendshapeInfluenceActive[i])
                {
                    float currentVisemeValue = bs_setpoint_last[i];
                    for (int j = 0; j < FaceMeshes.Count; j++) {
                        FaceMeshes[j].SetBlendShapeWeight(faceBlendshapes.GetByIndex(i), currentVisemeValue * (1 - lipsBlendshapeChangeSpeed * bs_CSF) + bs_setpoint[i] * lipsBlendshapeChangeSpeed * bs_CSF);
                        bs_setpoint_last[i] = FaceMeshes[j].GetBlendShapeWeight(faceBlendshapes.GetByIndex(i));
                    }
                }
            }
        }
        if (faceBlendshapes.BlendshapeAssigned("mouthOpen"))
        {
            for (int j = 0; j < FaceMeshes.Count; j++) {
                float currentValue = FaceMeshes[j].GetBlendShapeWeight(faceBlendshapes.mouthOpenIndex);
                FaceMeshes[j].SetBlendShapeWeight(faceBlendshapes.mouthOpenIndex, currentValue * (1 - (jawMovementSpeed * jaw_CSF)) + bs_mouthOpen_setpoint * (jawMovementSpeed * jaw_CSF));
            }
        }
    }
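    // The two weight updates above are exponential smoothing toward a setpoint:
    //     newWeight = oldWeight * (1 - s) + setpoint * s
    // with s = lipsBlendshapeChangeSpeed * bs_CSF for visemes and s = jawMovementSpeed * jaw_CSF
    // for mouthOpen. For example, with s = 0.5, an old weight of 20 and a setpoint of 60
    // give 20 * 0.5 + 60 * 0.5 = 40 on this LateUpdate.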
    public void UpdateExtractor()
    {
        // Rebuild the feature extractor, transformer, cepstral lifter and frequency band
        // indices for the current accuracy setting.
        extractor = ExtractFeatures.BuildExtractor(fres, ExtractFeatures.getlf(accuracy), ExtractFeatures.gethf(accuracy), accuracy);
        cmem = new float[ExtractFeatures.getC(accuracy) + 1, 2];
        modifier = ExtractFeatures.CreateCC_lifter(accuracy);
        transformer = ExtractFeatures.GenerateTransformer(accuracy);
        f_low = Mathf.RoundToInt(ExtractFeatures.getlf(accuracy) / fres);
        f_high = Mathf.RoundToInt(ExtractFeatures.gethf(accuracy) / fres);
    }
}