Sonotype Preprocessing

Sonotype Preprocessing

Before we could begin training a model on the keystroke data and recover the password from the audio, we needed to split each audio file into individual files, with each one containing exactly one keypress. We found that the pydub module allowed us to automate this process relatively easily.

We also used data augmentation both to increase the amount of data available for training the model and to regularize it. We applied multiple random combinations of four augmentation methods to each input file:

padding: adding a random amount of near-silence (very low-amplitude random noise) to the beginning and end of an audio clip.
cropping: removing the start and end of the audio clip.
volume: scaling each sample in the audio clip by a random amount.
noise: adding normally distributed noise to the entire audio clip.

import glob
import os
from random import choices, randint, shuffle, uniform

import librosa
import numpy as np
from pydub import AudioSegment
from pydub.silence import split_on_silence
from scipy.io import wavfile

# Split every recording in ./keystrokes into one WAV file per keypress.
# Each output is written to ./train as "<stroke>_<index>.wav", where <stroke>
# is the text after the last underscore in the recording's base name.
for path in sorted(glob.glob("./keystrokes/*.wav")):
    audio = AudioSegment.from_wav(path)
    # A gap counts as silence when it lasts at least 300 ms and is more than
    # 8 dB below the recording's average loudness.
    chunks = split_on_silence(
        audio,
        min_silence_len=300,
        silence_thresh=audio.dBFS - 8,
    )
    # os.path.basename copes with either path separator, unlike split("/").
    stroke = os.path.basename(path).split(".")[0].split("_")[-1]
    for i, c in enumerate(chunks):
        arr = np.array(c.get_array_of_samples())
        # pydub interleaves the channels in one flat array; scipy expects
        # multi-channel data shaped (n_samples, n_channels).
        if c.channels > 1:
            arr = arr.reshape(-1, c.channels)
        fname = f"{stroke}_{i}.wav"
        # Write at the segment's real sample rate instead of assuming 44100,
        # so recordings made at another rate keep their pitch and duration.
        wavfile.write(filename=f"./train/{fname}", rate=c.frame_rate, data=arr)

# Names of the available augmentation methods; augment() draws from this list.
AUGS = [
    'pad',
    'crop',
    'noise',
    'volume',
]

def augpad(y):
    """Pad a clip at both ends with low-amplitude random samples.

    Each side receives an independently drawn length of 2-15% of the clip
    length. The padding is uniform noise in [0, 0.005) rather than exact
    zeros.

    Args:
        y: 1-D array of audio samples.

    Returns:
        A new array laid out as ``[leading pad, y, trailing pad]``.
    """
    n_samples = len(y)
    lead_frac = uniform(0.02, 0.15)
    tail_frac = uniform(0.02, 0.15)
    lead = np.random.uniform(0, 0.005, (int(lead_frac * n_samples),))
    tail = np.random.uniform(0, 0.005, (int(tail_frac * n_samples),))
    return np.concatenate((lead, y, tail))

def augcrop(y):
    """Trim a random 2-15% of the clip from each end.

    The two trim lengths are drawn independently and truncated to whole
    samples, so very short clips may lose nothing at all.

    Args:
        y: 1-D array of audio samples.

    Returns:
        The slice of ``y`` that remains after trimming both ends.
    """
    l = len(y)
    p0 = uniform(0.02, 0.15)
    p1 = uniform(0.02, 0.15)
    l0 = int(p0 * l)
    l1 = int(p1 * l)
    # Use an explicit end index: with a negative stop, l1 == 0 would give
    # y[l0:-0] == y[l0:0], which silently returns an empty clip.
    return y[l0:l - l1]

def augnoise(y):
    """Add zero-mean Gaussian noise (standard deviation 0.001) to a clip.

    Args:
        y: Array of audio samples.

    Returns:
        A new array of the same shape: ``y`` plus one independent noise
        value per sample.
    """
    return y + np.random.normal(0, 0.001, y.shape)

def augvol(y):
    """Scale a clip's volume by a random factor within +/-10%.

    Args:
        y: Array of audio samples.

    Returns:
        A new array with every sample multiplied by the same gain, drawn
        uniformly from [0.9, 1.1).
    """
    gain = 1 + np.random.uniform(-0.1, 0.1)
    scaled = y * gain
    return scaled

def augment(path):
    """Load one clip and return it with 11 randomly augmented variants.

    The file is read with ``librosa.load`` at 22050 Hz. Each variant starts
    from a copy of that clip and has between one and four augmentations
    applied in sequence. The augmentations are drawn from ``AUGS`` with
    replacement, so the same one may be applied more than once.

    Args:
        path: Path to the audio file to augment.

    Returns:
        A list of 12 arrays: the unmodified clip first, then the variants.
    """
    # Decode and resample once rather than re-reading the file per variant.
    base, _ = librosa.load(path, sr=22050)
    # Any name not listed here falls back to augvol, as 'volume' does.
    dispatch = {'pad': augpad, 'crop': augcrop, 'noise': augnoise}
    outs = [base]
    for _ in range(11):
        # Work on a private copy so no variant shares memory with `base`.
        y = base.copy()
        n_augs = randint(1, 4)
        augs = choices(AUGS, k=n_augs)
        # Kept so the sequence of random draws matches earlier runs.
        shuffle(augs)
        for aug in augs:
            y = dispatch.get(aug, augvol)(y)
        outs.append(y)
    return outs

# Group the clips in ./train by key label, i.e. the part of the file name
# before the first underscore. Paths stay in sorted order within each group.
paths_by_key = {}
for path in sorted(glob.glob("./train/*.wav")):
    # os.path.basename copes with either path separator, unlike split("/").
    key = os.path.basename(path).split("_")[0]
    paths_by_key.setdefault(key, []).append(path)

# Write every original and augmented clip to ./train_aug as
# "<key>_<NNN>.wav", numbering the clips of each key consecutively from 000.
for key, paths in paths_by_key.items():
    # Lazily chain the outputs of augment() so each source file is only
    # augmented when its clips are about to be written.
    clips = (clip for src in paths for clip in augment(src))
    for index, clip in enumerate(clips):
        wavfile.write(f"./train_aug/{key}_{index:03d}.wav", 22050, clip)