<!-- livebook:{"app_settings":{"access_type":"public","auto_shutdown_ms":60000,"multi_session":true,"output_type":"rich","show_existing_sessions":false,"show_source":true,"slug":"vad"}} -->

# Nx Voice-Activity Detection

```elixir
Mix.install([
  {:ortex, "~> 0.1.9"},
  {:kino_vega_lite, "~> 0.1.10"},
  {:kino_live_audio, "~> 0.1"},
  {:req, "~> 0.4"}
])
```

## Setup Model & Plot

```elixir
# Download the pretrained Silero VAD ONNX model next to the notebook
url = "https://raw.githubusercontent.com/snakers4/silero-vad/master/files/silero_vad.onnx"
filename = url |> String.split("/") |> Enum.reverse() |> hd()

Req.get!(url, decode_body: false, into: File.stream!(filename))

# Load the downloaded model with Ortex (ONNX Runtime bindings)
model = Ortex.load(filename)

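# Live chart that will receive one speech-probability point per audio chunk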
chart =
  VegaLite.new(title: "Voice Activity Detection", width: 800, height: 400)
  |> VegaLite.mark(:line)
  |> VegaLite.encode_field(:x, "x",
    type: :quantitative,
    title: "Time",
    axis: [ticks: false, domain: false, grid: false, labels: false]
  )
  |> VegaLite.encode_field(:y, "y",
    type: :quantitative,
    title: "Voice",
    scale: [domain_max: 1, domain_min: 0]
  )
  |> Kino.VegaLite.new()
```
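
The streaming cell at the bottom assumes the classic Silero VAD signature: an f32 `{batch, samples}` audio chunk, an int64 sample rate, and two `{2, batch, 64}` recurrent states in, a speech probability plus updated states out. The cell below is a small added sanity check (not part of the original notebook) that exercises that signature once on a chunk of silence; the 512-sample window and the `silence`/`h0`/`c0` names are illustrative choices.

```elixir
# Run the model once on 512 samples of silence at 16 kHz.
silence = Nx.broadcast(0.0, {1, 512})
sr = Nx.tensor(16_000, type: :s64)
h0 = Nx.broadcast(0.0, {2, 1, 64})
c0 = Nx.broadcast(0.0, {2, 1, 64})

{prob, _hn, _cn} = Ortex.run(model, {silence, sr, h0, c0})

# The speech probability for pure silence should be close to 0.0.
prob |> Nx.flatten() |> Nx.to_list()
```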

```elixir
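# UI controls for the audio capture: chunk size, sample rate, time unit, and a button to clear the plot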
chunk_size = Kino.Input.text("Chunk Size", default: "1")
sample_rate = Kino.Input.text("Sample Rate", default: "16000")

unit =
  Kino.Input.select(
    "Unit",
    [samples: "Samples", s: "Seconds", ms: "Milliseconds", mu: "Microseconds"],
    default: :s
  )

clear = Kino.Control.button("Clear Plot")
clear |> Kino.listen(fn _ -> Kino.VegaLite.clear(chart) end)
top_row = Kino.Layout.grid([sample_rate, chunk_size, unit], columns: 3)
Kino.Layout.grid([top_row, clear])
```

```elixir
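# Browser microphone capture widget, configured from the inputs above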
live_audio =
  KinoLiveAudio.new(
    chunk_size: Kino.Input.read(chunk_size) |> Integer.parse() |> elem(0),
    unit: Kino.Input.read(unit),
    sample_rate: Kino.Input.read(sample_rate) |> Integer.parse() |> elem(0)
  )
```

```elixir
live_audio
|> Kino.Control.stream()
|> Kino.listen({Nx.broadcast(0.0, {2, 1, 64}), Nx.broadcast(0.0, {2, 1, 64})}, fn
  %{event: :audio_chunk, chunk: data}, {hn, cn} ->
    # Shape the chunk as a {1, samples} batch and the sample rate as an int64
    # scalar, matching the Silero VAD input signature.
    input = Nx.tensor(data) |> Nx.stack()
    sr = Nx.tensor(Kino.Input.read(sample_rate) |> Integer.parse() |> elem(0), type: :s64)

    # Run the model, threading the recurrent hidden/cell states through the accumulator.
    {output, hn, cn} = Ortex.run(model, {input, sr, hn, cn})
    [prob] = output |> Nx.flatten() |> Nx.to_list()

    # Plot the speech probability for this chunk, keeping the most recent 1000 points.
    Kino.VegaLite.push(chart, %{x: :os.system_time(), y: prob}, window: 1000)
    {:cont, {hn, cn}}
end)
```
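
The model emits a per-chunk speech probability rather than a hard speaking/silent decision. If you want to act on it (mute, segment, trigger recording), a common trick is to threshold the probability with a little hysteresis so brief dips don't make the state flicker. The module below is a hypothetical sketch of that idea (the `VadGate` name and the 0.6/0.4 thresholds are arbitrary choices, not part of the original notebook); you could call `VadGate.update/2` inside the listener above instead of, or in addition to, pushing points to the chart.

```elixir
defmodule VadGate do
  @moduledoc """
  Hypothetical helper: turns per-chunk speech probabilities into a stable
  :speaking / :silent state using two thresholds (hysteresis).
  """

  # Enter :speaking above 0.6, leave it only below 0.4.
  @on_threshold 0.6
  @off_threshold 0.4

  def update(:silent, prob) when prob >= @on_threshold, do: :speaking
  def update(:speaking, prob) when prob <= @off_threshold, do: :silent
  def update(state, _prob), do: state
end

# Example: fold a list of probabilities into the state after each chunk.
[0.1, 0.7, 0.8, 0.5, 0.3, 0.2]
|> Enum.scan(:silent, fn prob, state -> VadGate.update(state, prob) end)
# => [:silent, :speaking, :speaking, :speaking, :silent, :silent]
```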