From 82ad7968e16de0dc98e984acb4252db978eefd0c Mon Sep 17 00:00:00 2001 From: Kacper Donat Date: Sun, 21 Jun 2020 22:14:29 +0200 Subject: [PATCH] Add test stuff --- .gitignore | 8 +-- .vscode/settings.json | 3 + Makefile | 6 +- clean.csv | 19 ++++++ colorednoise.py | 108 +++++++++++++++++++++++++++++++++ degraded.csv | 55 +++++++++++++++++ test.py | 32 ++++++++++ transform_wav.py | 135 ++++++++++++++++++++++++------------------ 8 files changed, 303 insertions(+), 63 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 clean.csv create mode 100644 colorednoise.py create mode 100644 degraded.csv create mode 100644 test.py diff --git a/.gitignore b/.gitignore index accea65..0ce1766 100644 --- a/.gitignore +++ b/.gitignore @@ -139,11 +139,9 @@ cython_debug/ # Generated docs *.pdf - -/dataset/wav/ -!/dataset/wav/.gitkeep -/dataset/processed/ -!/dataset/processed/.gitkeep +/dataset/* +!/dataset/midi/ +!/dataset/*/.gitkeep /dataset/manifest.csv /eval/ metrics.csv \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..42be726 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.pythonPath": "/opt/conda/envs/magenta/bin/python" +} \ No newline at end of file diff --git a/Makefile b/Makefile index 3ab46d2..161ba13 100644 --- a/Makefile +++ b/Makefile @@ -12,4 +12,8 @@ dataset: manifest test: dataset onsets_frames_transcription_infer --model_dir="${MODEL_DIR}" --output_dir="./eval/" --examples_path=./dataset/processed/test.tfrecord* --hparams="use_cudnn=false" --preprocess_examples=True - \ No newline at end of file + +.PHONY: clean + +clean: + rm eval/* \ No newline at end of file diff --git a/clean.csv b/clean.csv new file mode 100644 index 0000000..5b5ddb1 --- /dev/null +++ b/clean.csv @@ -0,0 +1,19 @@ +,metric,value +0,final/metrics/frame_precision,0.730536699295044 +1,final/metrics/frame_recall,0.8425199389457703 +2,final/metrics/frame_f1_score,0.7802173495292664 
+3,final/metrics/frame_accuracy,0.9841498732566833 +4,final/metrics/frame_accuracy_without_true_negatives,0.652758002281189 +5,final/metrics/note_density,7.3658447265625 +6,final/metrics/note_precision,0.5048999190330505 +7,final/metrics/note_recall,0.4615851938724518 +8,final/metrics/note_f1_score,0.48121312260627747 +9,final/metrics/note_with_velocity_precision,0.34594792127609253 +10,final/metrics/note_with_velocity_recall,0.3155739903450012 +11,final/metrics/note_with_velocity_f1_score,0.32934126257896423 +12,final/metrics/note_with_offsets_precision,0.32145705819129944 +13,final/metrics/note_with_offsets_recall,0.29507842659950256 +14,final/metrics/note_with_offsets_f1_score,0.3070283532142639 +15,final/metrics/note_with_offsets_velocity_precision,0.22619099915027618 +16,final/metrics/note_with_offsets_velocity_recall,0.2070828378200531 +17,final/metrics/note_with_offsets_velocity_f1_score,0.21574667096138 diff --git a/colorednoise.py b/colorednoise.py new file mode 100644 index 0000000..051818e --- /dev/null +++ b/colorednoise.py @@ -0,0 +1,108 @@ +"""Generate colored noise.""" + +from numpy import sqrt, newaxis +from numpy.fft import irfft, rfftfreq +from numpy.random import normal +from numpy import sum as npsum + + +def powerlaw_psd_gaussian(exponent, size, fmin=0): + """Gaussian (1/f)**beta noise. + + Based on the algorithm in: + Timmer, J. and Koenig, M.: + On generating power law noise. + Astron. Astrophys. 300, 707-710 (1995) + + Normalised to unit variance + + Parameters: + ----------- + + exponent : float + The power-spectrum of the generated noise is proportional to + + S(f) = (1 / f)**beta + flicker / pink noise: exponent beta = 1 + brown noise: exponent beta = 2 + + Furthermore, the autocorrelation decays proportional to lag**-gamma + with gamma = 1 - beta for 0 < beta < 1. + There may be finite-size issues for beta close to one. + + size : int or iterable + The output has the given shape, and the desired power spectrum in + the last coordinate. 
That is, the last dimension is taken as time, + and all other components are independent. + + fmin : float, optional + Low-frequency cutoff. + Default: 0 corresponds to original paper. It is not actually + zero, but 1/samples. + + Returns + ------- + out : array + The samples. + + + Examples: + --------- + + # generate 1/f noise == pink noise == flicker noise + >>> import colorednoise as cn + >>> y = cn.powerlaw_psd_gaussian(1, 5) + """ + + # Make sure size is a list so we can iterate it and assign to it. + try: + size = list(size) + except TypeError: + size = [size] + + # The number of samples in each time series + samples = size[-1] + + # Calculate Frequencies (we assume a sample rate of one) + # Use fft functions for real output (-> hermitian spectrum) + f = rfftfreq(samples) + + # Build scaling factors for all frequencies + s_scale = f + fmin = max(fmin, 1./samples) # Low frequency cutoff + ix = npsum(s_scale < fmin) # Index of the cutoff + if ix and ix < len(s_scale): + s_scale[:ix] = s_scale[ix] + s_scale = s_scale**(-exponent/2.) + + # Calculate theoretical output standard deviation from scaling + w = s_scale[1:].copy() + w[-1] *= (1 + (samples % 2)) / 2. # correct f = +-0.5 + sigma = 2 * sqrt(npsum(w**2)) / samples + + # Adjust size to generate one Fourier component per frequency + size[-1] = len(f) + + # Add empty dimension(s) to broadcast s_scale along last + # dimension of generated random power + phase (below) + dims_to_add = len(size) - 1 + s_scale = s_scale[(newaxis,) * dims_to_add + (Ellipsis,)] + + # Generate scaled random power + phase + sr = normal(scale=s_scale, size=size) + si = normal(scale=s_scale, size=size) + + # If the signal length is even, frequencies +/- 0.5 are equal + # so the coefficient must be real. 
+ if not (samples % 2): si[...,-1] = 0 + + # Regardless of signal length, the DC component must be real + si[...,0] = 0 + + # Combine power + corrected phase to Fourier components + s = sr + 1J * si + + # Transform to real time series & scale to unit variance + y = irfft(s, n=samples, axis=-1) / sigma + + return y diff --git a/degraded.csv b/degraded.csv new file mode 100644 index 0000000..102adea --- /dev/null +++ b/degraded.csv @@ -0,0 +1,55 @@ +,metric,value +0,final/metrics/frame_precision,0.7293761372566223 +1,final/metrics/frame_recall,0.8212106227874756 +2,final/metrics/frame_f1_score,0.7701795697212219 +3,final/metrics/frame_accuracy,0.9836516380310059 +4,final/metrics/frame_accuracy_without_true_negatives,0.6388141512870789 +5,final/metrics/note_density,7.263808250427246 +6,final/metrics/note_precision,0.5048912763595581 +7,final/metrics/note_recall,0.45291581749916077 +8,final/metrics/note_f1_score,0.4764764904975891 +9,final/metrics/note_with_velocity_precision,0.38284000754356384 +10,final/metrics/note_with_velocity_recall,0.3431974947452545 +11,final/metrics/note_with_velocity_f1_score,0.3611662983894348 +12,final/metrics/note_with_offsets_precision,0.3148140609264374 +13,final/metrics/note_with_offsets_recall,0.28389376401901245 +14,final/metrics/note_with_offsets_f1_score,0.29793334007263184 +15,final/metrics/note_with_offsets_velocity_precision,0.24326983094215393 +16,final/metrics/note_with_offsets_velocity_recall,0.21928410232067108 +17,final/metrics/note_with_offsets_velocity_f1_score,0.23017948865890503 +0,final/metrics/frame_precision,0.6845414638519287 +1,final/metrics/frame_recall,0.7084149718284607 +2,final/metrics/frame_f1_score,0.6921667456626892 +3,final/metrics/frame_accuracy,0.9789363741874695 +4,final/metrics/frame_accuracy_without_true_negatives,0.5345737338066101 +5,final/metrics/note_density,6.421718597412109 +6,final/metrics/note_precision,0.49045559763908386 +7,final/metrics/note_recall,0.3755749464035034 
+8,final/metrics/note_f1_score,0.42303866147994995 +9,final/metrics/note_with_velocity_precision,0.3649778962135315 +10,final/metrics/note_with_velocity_recall,0.27852118015289307 +11,final/metrics/note_with_velocity_f1_score,0.31417202949523926 +12,final/metrics/note_with_offsets_precision,0.2567719519138336 +13,final/metrics/note_with_offsets_recall,0.19949588179588318 +14,final/metrics/note_with_offsets_f1_score,0.22342373430728912 +15,final/metrics/note_with_offsets_velocity_precision,0.19573232531547546 +16,final/metrics/note_with_offsets_velocity_recall,0.15153546631336212 +17,final/metrics/note_with_offsets_velocity_f1_score,0.16995804011821747 +0,final/metrics/frame_precision,0.730536699295044 +1,final/metrics/frame_recall,0.8425199389457703 +2,final/metrics/frame_f1_score,0.7802173495292664 +3,final/metrics/frame_accuracy,0.9841498732566833 +4,final/metrics/frame_accuracy_without_true_negatives,0.652758002281189 +5,final/metrics/note_density,7.3658447265625 +6,final/metrics/note_precision,0.5048999190330505 +7,final/metrics/note_recall,0.4615851938724518 +8,final/metrics/note_f1_score,0.48121312260627747 +9,final/metrics/note_with_velocity_precision,0.34594792127609253 +10,final/metrics/note_with_velocity_recall,0.3155739903450012 +11,final/metrics/note_with_velocity_f1_score,0.32934126257896423 +12,final/metrics/note_with_offsets_precision,0.32145705819129944 +13,final/metrics/note_with_offsets_recall,0.29507842659950256 +14,final/metrics/note_with_offsets_f1_score,0.3070283532142639 +15,final/metrics/note_with_offsets_velocity_precision,0.22619099915027618 +16,final/metrics/note_with_offsets_velocity_recall,0.2070828378200531 +17,final/metrics/note_with_offsets_velocity_f1_score,0.21574667096138 diff --git a/test.py b/test.py new file mode 100644 index 0000000..d0f697f --- /dev/null +++ b/test.py @@ -0,0 +1,32 @@ +from argparse import ArgumentParser +from glob import glob +from transform_wav import transform_file +from os import system, environ, mkdir + 
+parser = ArgumentParser(description="Preform transformation on wav files.") +parser.add_argument('-p', dest='pitch_shift_value', type=float, help="Pitch shift value.") +parser.add_argument('-a', dest='amplitude_multiplier_value', type=float, help="Amplitude multiplier value.") +parser.add_argument('-n', dest='noise_amplitude', type=float, help="Amplitude of noise.") +parser.add_argument('--input', '-i', dest='input_directory', help="Path to directory with input waves path.") +parser.add_argument('--output', '-o', dest='output_directory', help="Path to directory to output waves path.") + +args = parser.parse_args() + +try: + mkdir(args.output_directory) +except: + pass + +for input_path in glob(f"{args.input_directory}/*.wav"): + output_path = input_path.replace(args.input_directory, args.output_directory) + + print(f"Transforming {input_path} into {output_path}") + transform_file( + input_path, output_path, + pitch_shift_value = args.pitch_shift_value, + amplitude_multiplier_value = args.amplitude_multiplier_value, + noise_amplitude = args.noise_amplitude, + ) + +system(f'python /opt/conda/envs/magenta/lib/python3.7/site-packages/magenta/models/onsets_frames_transcription/onsets_frames_transcription_create_tfrecords.py --csv="./dataset/manifest.csv" --output_directory="./dataset/processed" --wav_dir="{args.output_directory}" --midi_dir="./dataset/midi" --expected_splits="test"') +system(f'onsets_frames_transcription_infer --model_dir="{environ["MODEL_DIR"]}" --output_dir="./eval/" --examples_path=./dataset/processed/test.tfrecord* --hparams="use_cudnn=false" --preprocess_examples=True') \ No newline at end of file diff --git a/transform_wav.py b/transform_wav.py index ba38a26..9955b17 100644 --- a/transform_wav.py +++ b/transform_wav.py @@ -3,91 +3,112 @@ Basic command-line tool used to transform .wav files. 
@author: Szymon Szczyrbak +@author: Kacper Donat """ import wave import numpy as np import struct +import resampy from argparse import ArgumentParser - -parser = ArgumentParser(description="Preform transformation on wav files.") -parser.add_argument('--input', '-i', dest='input_path', help="Input wav file path.") -parser.add_argument('--output', '-o', dest='output_path', help="Output wav file path.") -parser.add_argument('-p', dest='pitch_shift_value', type=int, help="Pitch shift value.") -parser.add_argument('-a', dest='amplitude_multiplier_value', type=float, help="Amplitude multiplier value.") +from noise import pnoise1 +from colorednoise import powerlaw_psd_gaussian # https://stackoverflow.com/questions/43963982/python-change-pitch-of-wav-file -def shift_pitch(input_path, output_path, pitch_shift_value): - # Read input file. - wr = wave.open(input_path, 'r') - # Set the parameters for the output file. - par = list(wr.getparams()) - par[3] = 0 # The number of samples will be set by writeframes. - par = tuple(par) - ww = wave.open(output_path, 'w') - ww.setparams(par) - - fr = 20 # TODO: Not sure what does it do... Higher number reduces reverb? - sz = wr.getframerate()//fr # Number of samples processed in one loop iteration. - - c = int(wr.getnframes()/sz) # number of samples / - shift = pitch_shift_value//fr - for num in range(c): - # Read chunk and split into left and right channels - da = np.fromstring(wr.readframes(sz), dtype=np.int16) - left, right = da[0::2], da[1::2] - # Extract the frequencies using FFT. - lf, rf = np.fft.rfft(left), np.fft.rfft(right) - # Increase the pitch by rolling arrays. - lf, rf = np.roll(lf, shift), np.roll(rf, shift) - # Highest frequencies rolled at the start of the array. Zero 'em. - if pitch_shift_value > 0: - lf[0:shift], rf[0:shift] = 0, 0 - else: - lf[shift-1:-1], rf[shift-1:-1] = 0, 0 # TODO: Not sure if it's alright for negative shift. - # Inverse FFT. 
- nl, nr = np.fft.irfft(lf), np.fft.irfft(rf) - # Combine left and right channel. - ns = np.column_stack((nl, nr)).ravel().astype(np.int16) - # Write to output file. - ww.writeframes(ns.tostring()) - wr.close() - ww.close() +def shift_pitch(da, wr, pitch_shift_value): + multiplier = 1.0 + ((pitch_shift_value - 1.0) * (2**(1/12) - 1.0)) * 0.1; + left, right = da[0::2], da[1::2] + + # Extract the frequencies using FFT. + lf, rf = np.fft.rfft(left), np.fft.rfft(right) + + size = len(lf) + new_size = int(size * multiplier) + + lf, rf = resampy.resample(lf, size, new_size), resampy.resample(rf, size, new_size) + + # Inverse FFT. + nl, nr = np.fft.irfft(lf), np.fft.irfft(rf) + + # Combine left and right channel. + return np.column_stack((nl, nr)).ravel().astype(np.int16) # https://stackoverflow.com/questions/13329617/change-the-volume-of-a-wav-file-in-python -def multiply_amplitude(input_path, output_path, amplitude_multiplier_value): +def multiply_amplitude(samples, wr, i, amplitude_multiplier_value): + amplitude_multiplier_value *= 2 + noise = np.vectorize(lambda x: (1.0 - amplitude_multiplier_value / 2) + pnoise1((x + i) * 0.05 / wr.getframerate(), octaves=5) * amplitude_multiplier_value) + + modulate = np.fromfunction(noise, samples.shape, dtype=float) + + return (samples * modulate).astype(np.int16) # multiply amplitude + + +def add_noise(samples, wr, amplitude=0.1): + noise = powerlaw_psd_gaussian(1, samples.shape) + + RMS = np.mean(samples ** 2) + An = np.sqrt(RMS * amplitude) + + return (samples + noise * An).astype(np.int16) + +def transform_file(input_path, output_path, pitch_shift_value=None, amplitude_multiplier_value=None, noise_amplitude=None): # Read input file. wr = wave.open(input_path, 'r') samples_num = wr.getparams()[3] + # Set the parameters for the output file. par = list(wr.getparams()) par[3] = 0 # The number of samples will be set by writeframes. 
par = tuple(par) + + # Open output file ww = wave.open(output_path, 'w') ww.setparams(par) - - da = np.fromstring(wr.readframes(samples_num), np.int16) * amplitude_multiplier_value # multiply amplitude - da = da.astype(np.int16) - ns = struct.pack('h'*len(da), *da) - ww.writeframes(ns) + + fr = 5 + per_loop = wr.getframerate() // fr + count = wr.getnframes() // per_loop + + for i in range(count): + ns = np.frombuffer(wr.readframes(per_loop), dtype=np.int16) + + if isinstance(pitch_shift_value, float) and pitch_shift_value - 1.0 > 0.001: + ns = shift_pitch(ns, wr, pitch_shift_value) + + if isinstance(amplitude_multiplier_value, float) and amplitude_multiplier_value > 0.01: + ns = multiply_amplitude(ns, wr, i * per_loop, amplitude_multiplier_value) + + if isinstance(noise_amplitude, float) and noise_amplitude > 0.0: + ns = add_noise(ns, wr, noise_amplitude) + + ww.writeframes(ns) wr.close() ww.close() +parser = ArgumentParser(description="Preform transformation on wav files.") + +parser.add_argument('--input', '-i', dest='input_path', help="Input wav file path.") +parser.add_argument('--output', '-o', dest='output_path', help="Output wav file path.") + +parser.add_argument('-p', dest='pitch_shift_value', type=float, help="Pitch shift value.") +parser.add_argument('-a', dest='amplitude_multiplier_value', type=float, help="Amplitude multiplier value.") +parser.add_argument('-n', dest='noise_amplitude', type=float, help="Amplitude of noise.") + def main(): args = parser.parse_args() + input_path = args.input_path - output_path = args.output_path - pitch_shift_value = args.pitch_shift_value - amplitude_multiplier_value = args.amplitude_multiplier_value - if isinstance(pitch_shift_value, int): - shift_pitch(input_path, output_path, pitch_shift_value) - elif isinstance(amplitude_multiplier_value, float): - # TODO: Weird noises for amplitude x5. - multiply_amplitude(input_path, output_path, amplitude_multiplier_value) - # TODO: More transformations. 
Should only one transformation be preformed in one run? + output_path = args.output_path + + transform_file( + input_path, output_path, + pitch_shift_value=args.pitch_shift_value, + amplitude_multiplier_value=args.amplitude_multiplier_value, + noise_amplitude=args.noise_amplitude + ) if __name__ == "__main__": main() \ No newline at end of file