UnityPaul committed on
Commit
7754914
1 Parent(s): fb3e800

Upload 6 files

Browse files
AudioDecoder_Tiny.sentis CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6d24553eda46f335ead8ba30e3970fc8056086a538047248821aa31a135f938
3
- size 198832845
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e213397b356d02117ba9489a717c9ff1402175c55ab8882800affa595079768a
3
+ size 198748952
AudioEncoder_Tiny.sentis CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d3fb532b04b438079db8de9551a0d813da22be5fd05cdeeff3d09794492ca5b1
3
- size 32888514
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7da4d76dcbd84659f22e744a89ef7916a75a873415fac953459384ee7d4b457
3
+ size 32860344
LogMelSepctro.sentis CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e021007141fdf2d39113ea1aa12bc258226ea1c2976171544f3a05979e2b69ef
3
- size 1360848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f1d5bf692d1e8bfb225b493386614f16f1a7c71bc68a5d3106b79793640a8ab
3
+ size 1353668
README.md CHANGED
@@ -4,14 +4,14 @@ library_name: unity-sentis
4
  pipeline_tag: automatic-speech-recognition
5
  ---
6
 
7
- # Whisper-Tiny model in Unity Sentis (Version 1.3.0-pre.3*)
8
- *Version 1.3.0 Sentis files are not compatible with 1.4.0 and above and need to be recreated
9
 
10
  This is the [Whisper Tiny](https://huggingface.co/openai/whisper-tiny) model tested to work in Unity 2023. It is a speech-to-text model. You feed in a 16kHz wav file and it outputs the best guess for what was said in the audio.
11
 
12
  ## How to Use
13
  * Open a new scene in Unity 2023
14
- * Import package ``com.unity.sentis`` version `1.3.0-pre.3` from the package manager.
15
  * Put the `RunWhisper.cs` on the Main Camera
16
  * Put the *.sentis files and the `vocab.json` in the Assets/StreamingAssets folder
17
  * Add a 16kHz mono audio file up to 30 seconds long to your project and drag on to the audioClip field.
 
4
  pipeline_tag: automatic-speech-recognition
5
  ---
6
 
7
+ # Whisper-Tiny model in Unity Sentis (Version 1.4.0-pre.2*)
8
+ (*Sentis files from 1.3.0 and earlier will not be compatible and would need to be recreated.)
9
 
10
  This is the [Whisper Tiny](https://huggingface.co/openai/whisper-tiny) model tested to work in Unity 2023. It is a speech-to-text model. You feed in a 16kHz wav file and it outputs the best guess for what was said in the audio.
11
 
12
  ## How to Use
13
  * Open a new scene in Unity 2023
14
+ * Import package ``com.unity.sentis`` version `1.4.0-pre.2` from the package manager.
15
  * Put the `RunWhisper.cs` on the Main Camera
16
  * Put the *.sentis files and the `vocab.json` in the Assets/StreamingAssets folder
17
  * Add a 16kHz mono audio file up to 30 seconds long to your project and drag on to the audioClip field.
RunWhisper.cs CHANGED
@@ -49,10 +49,6 @@ public class RunWhisper : MonoBehaviour
49
  const int TRANSLATE = 50358; //for speech-to-text then translate to English
50
  const int NO_TIME_STAMPS = 50363;
51
  const int START_TIME = 50364;
52
-
53
-
54
- Ops ops;
55
- ITensorAllocator allocator;
56
 
57
  int numSamples;
58
  float[] data;
@@ -74,18 +70,22 @@ public class RunWhisper : MonoBehaviour
74
 
75
  void Start()
76
  {
77
- allocator = new TensorCachingAllocator();
78
- ops = WorkerFactory.CreateOps(backend, allocator);
79
 
80
  SetupWhiteSpaceShifts();
81
 
82
  GetTokens();
83
 
84
  Model decoder = ModelLoader.Load(Application.streamingAssetsPath + "/AudioDecoder_Tiny.sentis");
 
 
 
 
 
 
85
  Model encoder = ModelLoader.Load(Application.streamingAssetsPath + "/AudioEncoder_Tiny.sentis");
86
  Model spectro = ModelLoader.Load(Application.streamingAssetsPath + "/LogMelSepctro.sentis");
87
 
88
- decoderEngine = WorkerFactory.CreateWorker(backend, decoder);
89
  encoderEngine = WorkerFactory.CreateWorker(backend, encoder);
90
  spectroEngine = WorkerFactory.CreateWorker(backend, spectro);
91
 
@@ -116,7 +116,9 @@ public class RunWhisper : MonoBehaviour
116
  return;
117
  }
118
 
119
- data = new float[numSamples];
 
 
120
  audioClip.GetData(data, 0);
121
  }
122
 
@@ -136,10 +138,7 @@ public class RunWhisper : MonoBehaviour
136
  {
137
  using var input = new TensorFloat(new TensorShape(1, numSamples), data);
138
 
139
- // Pad out to 30 seconds at 16khz if necessary
140
- using var input30seconds = ops.Pad(input, new int[] { 0, 0, 0, maxSamples - numSamples });
141
-
142
- spectroEngine.Execute(input30seconds);
143
  var spectroOutput = spectroEngine.PeekOutput() as TensorFloat;
144
 
145
  encoderEngine.Execute(spectroOutput);
@@ -156,15 +155,14 @@ public class RunWhisper : MonoBehaviour
156
 
157
  var inputs = new Dictionary<string, Tensor>
158
  {
159
- {"encoded_audio",encodedAudio },
160
- {"tokens" , tokensSoFar }
161
  };
162
 
163
  decoderEngine.Execute(inputs);
164
- var tokensOut = decoderEngine.PeekOutput() as TensorFloat;
165
 
166
- using var tokensPredictions = ops.ArgMax(tokensOut, 2, false);
167
- tokensPredictions.MakeReadable();
168
 
169
  int ID = tokensPredictions[currentToken];
170
 
@@ -225,7 +223,5 @@ public class RunWhisper : MonoBehaviour
225
  decoderEngine?.Dispose();
226
  encoderEngine?.Dispose();
227
  spectroEngine?.Dispose();
228
- ops?.Dispose();
229
- allocator?.Dispose();
230
  }
231
  }
 
49
  const int TRANSLATE = 50358; //for speech-to-text then translate to English
50
  const int NO_TIME_STAMPS = 50363;
51
  const int START_TIME = 50364;
 
 
 
 
52
 
53
  int numSamples;
54
  float[] data;
 
70
 
71
  void Start()
72
  {
 
 
73
 
74
  SetupWhiteSpaceShifts();
75
 
76
  GetTokens();
77
 
78
  Model decoder = ModelLoader.Load(Application.streamingAssetsPath + "/AudioDecoder_Tiny.sentis");
79
+
80
+ Model decoderWithArgMax = Functional.Compile(
81
+ (tokens, audio) => Functional.ArgMax(decoder.Forward(tokens, audio)[0], 2),
82
+ (decoder.inputs[0], decoder.inputs[1])
83
+ );
84
+
85
  Model encoder = ModelLoader.Load(Application.streamingAssetsPath + "/AudioEncoder_Tiny.sentis");
86
  Model spectro = ModelLoader.Load(Application.streamingAssetsPath + "/LogMelSepctro.sentis");
87
 
88
+ decoderEngine = WorkerFactory.CreateWorker(backend, decoderWithArgMax);
89
  encoderEngine = WorkerFactory.CreateWorker(backend, encoder);
90
  spectroEngine = WorkerFactory.CreateWorker(backend, spectro);
91
 
 
116
  return;
117
  }
118
 
119
+ data = new float[maxSamples];
120
+ numSamples = maxSamples;
121
+ //We will get a warning here if data.length is larger than audio length but that is OK
122
  audioClip.GetData(data, 0);
123
  }
124
 
 
138
  {
139
  using var input = new TensorFloat(new TensorShape(1, numSamples), data);
140
 
141
+ spectroEngine.Execute(input);
 
 
 
142
  var spectroOutput = spectroEngine.PeekOutput() as TensorFloat;
143
 
144
  encoderEngine.Execute(spectroOutput);
 
155
 
156
  var inputs = new Dictionary<string, Tensor>
157
  {
158
+ {"input_0", tokensSoFar },
159
+ {"input_1", encodedAudio }
160
  };
161
 
162
  decoderEngine.Execute(inputs);
163
+ var tokensPredictions = decoderEngine.PeekOutput() as TensorInt;
164
 
165
+ tokensPredictions.CompleteOperationsAndDownload();
 
166
 
167
  int ID = tokensPredictions[currentToken];
168
 
 
223
  decoderEngine?.Dispose();
224
  encoderEngine?.Dispose();
225
  spectroEngine?.Dispose();
 
 
226
  }
227
  }
info.json CHANGED
@@ -11,6 +11,6 @@
11
  "vocab.json"
12
  ],
13
  "version" : [
14
- "1.3.0-pre.3"
15
  ]
16
  }
 
11
  "vocab.json"
12
  ],
13
  "version" : [
14
+ "1.4.0"
15
  ]
16
  }