1111# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212# See the License for the specific language governing permissions and
1313# limitations under the License.
14+ """
15+ Parameters used in the audio pipeline
16+ These configure the following stages:
17+ - Conversion from audio to input vectors
18+ - Interpretation of the network output to a confidence value
19+ """
1420from math import floor
1521
1622import attr
2127
2228@attr .s (frozen = True )
2329class ListenerParams :
30+ """
31+ General pipeline information:
32+ - Audio goes through a series of transformations to convert raw audio into machine readable data
33+ - These transformations are as follows:
34+ - Raw audio -> chopped audio
35+ - buffer_t, sample_depth: Input audio loaded and truncated using these values
36+ - window_t, hop_t: Linear audio chopped into overlapping frames using a sliding window
37+ - Chopped audio -> FFT spectrogram
38+ - n_fft, sample_rate: Each audio frame is converted to n_fft frequency intensities
39+ - FFT spectrogram -> Mel spectrogram (compressed)
40+ - n_filt: Each fft frame is compressed to n_filt summarized mel frequency bins/bands
41+ - Mel spectrogram -> MFCC
42+ - n_mfcc: Each mel frame is converted to MFCCs and the first n_mfcc values are taken
43+ - Disabled by default: Last phase -> Delta vectors
44+ - use_delta: If this value is true, the difference between consecutive vectors is concatenated to each frame
45+
46+ Parameters for audio pipeline:
47+ - buffer_t: Input size of audio. Wakeword must fit within this time
48+ - window_t: Time of the window used to calculate a single spectrogram frame
49+ - hop_t: Time the window advances forward to calculate the next spectrogram frame
50+ - sample_rate: Input audio sample rate
51+ - sample_depth: Bytes per input audio sample
52+ - n_fft: Size of FFT to generate from audio frame
53+ - n_filt: Number of filters to compress FFT to
54+ - n_mfcc: Number of MFCC coefficients to use
55+ - use_delta: If True, generates "delta vectors" before sending to network
56+ - vectorizer: The type of input fed into the network. Options listed in class Vectorizer
57+ - threshold_config: Output distribution configuration automatically generated from precise-calc-threshold
58+ - threshold_center: Output distribution center automatically generated from precise-calc-threshold
59+ """
60+ buffer_t = attr .ib () # type: float
2461 window_t = attr .ib () # type: float
2562 hop_t = attr .ib () # type: float
26- buffer_t = attr .ib () # type: float
2763 sample_rate = attr .ib () # type: int
2864 sample_depth = attr .ib () # type: int
29- n_mfcc = attr .ib () # type: int
30- n_filt = attr .ib () # type: int
3165 n_fft = attr .ib () # type: int
66+ n_filt = attr .ib () # type: int
67+ n_mfcc = attr .ib () # type: int
3268 use_delta = attr .ib () # type: bool
3369 vectorizer = attr .ib () # type: int
3470 threshold_config = attr .ib () # type: tuple
3571 threshold_center = attr .ib () # type: float
3672
@property
def buffer_samples(self):
    """Return buffer_t in samples, rounded down to a whole number of hops.

    The buffer duration is first converted to the nearest sample count,
    then any trailing samples that do not fill a complete hop are dropped,
    so the result is always a multiple of ``hop_samples``.
    """
    hop = self.hop_samples
    # Adding 0.5 before truncating rounds to the nearest sample.
    total = int(self.sample_rate * self.buffer_t + 0.5)
    # Strip the remainder that does not fill a whole hop.
    return total - total % hop
4178
@property
def n_features(self):
    """Return the number of timesteps in one input to the network.

    Computed as one frame for the initial window position plus one more
    for every whole hop that fits in the rest of the buffer.
    """
    remaining = self.buffer_samples - self.window_samples
    extra_frames = int(floor(remaining / self.hop_samples))
    return extra_frames + 1
4583
@property
def window_samples(self):
    """Return window_t converted to a sample count, rounded to nearest."""
    exact = self.sample_rate * self.window_t
    # Adding 0.5 before truncating rounds to the nearest sample.
    return int(exact + 0.5)
4988
@property
def hop_samples(self):
    """Return hop_t converted to a sample count, rounded to nearest."""
    exact = self.sample_rate * self.hop_t
    # Adding 0.5 before truncating rounds to the nearest sample.
    return int(exact + 0.5)
5393
@property
def max_samples(self):
    """Return the input size (buffer_t) converted to audio samples.

    The fractional part of the product is truncated, not rounded.
    """
    exact = self.buffer_t * self.sample_rate
    return int(exact)
5798
5899 @property
59100 def feature_size (self ):
101+ """The size of an input vector generated with these parameters"""
60102 num_features = {
61103 Vectorizer .mfccs : self .n_mfcc ,
62104 Vectorizer .mels : self .n_filt ,
@@ -77,15 +119,27 @@ def vectorization_md5_hash(self):
77119
78120
class Vectorizer:
    """
    Integer identifiers selecting which function vectorizes the audio.

    Options:
        mels: Convert to a compressed Mel spectrogram
        mfccs: Convert to an MFCC spectrogram
        speechpy_mfccs: Legacy option to convert to MFCCs using old library
    """
    # Identifiers are bound in declaration order: 1, 2, 3.
    mels, mfccs, speechpy_mfccs = 1, 2, 3
83133
84134
# Global listener parameters: the default values for every parameter.
# They were selected tentatively to balance CPU usage with accuracy.
# For the Hey Mycroft wake word, small changes to these parameters
# did not make a significant difference in accuracy.
pr = ListenerParams(
    buffer_t=1.5,
    window_t=0.1,
    hop_t=0.05,
    sample_rate=16000,
    sample_depth=2,
    n_fft=512,
    n_filt=20,
    n_mfcc=13,
    use_delta=False,
    vectorizer=Vectorizer.mfccs,
    threshold_config=((6, 4),),
    threshold_center=0.2,
)
91145
0 commit comments