Add test stuff

Kacper Donat 2020-06-21 22:14:29 +02:00
parent 72d7060fd8
commit 82ad7968e1
8 changed files with 303 additions and 63 deletions

.gitignore vendored (8 lines changed)

@@ -139,11 +139,9 @@ cython_debug/
# Generated docs
*.pdf
/dataset/wav/
!/dataset/wav/.gitkeep
/dataset/processed/
!/dataset/processed/.gitkeep
/dataset/*
!/dataset/midi/
!/dataset/*/.gitkeep
/dataset/manifest.csv
/eval/
metrics.csv

.vscode/settings.json vendored (new file, 3 lines)

@@ -0,0 +1,3 @@
{
    "python.pythonPath": "/opt/conda/envs/magenta/bin/python"
}

Makefile

@@ -12,4 +12,8 @@ dataset: manifest
test: dataset
	onsets_frames_transcription_infer --model_dir="${MODEL_DIR}" --output_dir="./eval/" --examples_path=./dataset/processed/test.tfrecord* --hparams="use_cudnn=false" --preprocess_examples=True

.PHONY: clean
clean:
	rm eval/*

clean.csv (new file, 19 lines)

@@ -0,0 +1,19 @@
,metric,value
0,final/metrics/frame_precision,0.730536699295044
1,final/metrics/frame_recall,0.8425199389457703
2,final/metrics/frame_f1_score,0.7802173495292664
3,final/metrics/frame_accuracy,0.9841498732566833
4,final/metrics/frame_accuracy_without_true_negatives,0.652758002281189
5,final/metrics/note_density,7.3658447265625
6,final/metrics/note_precision,0.5048999190330505
7,final/metrics/note_recall,0.4615851938724518
8,final/metrics/note_f1_score,0.48121312260627747
9,final/metrics/note_with_velocity_precision,0.34594792127609253
10,final/metrics/note_with_velocity_recall,0.3155739903450012
11,final/metrics/note_with_velocity_f1_score,0.32934126257896423
12,final/metrics/note_with_offsets_precision,0.32145705819129944
13,final/metrics/note_with_offsets_recall,0.29507842659950256
14,final/metrics/note_with_offsets_f1_score,0.3070283532142639
15,final/metrics/note_with_offsets_velocity_precision,0.22619099915027618
16,final/metrics/note_with_offsets_velocity_recall,0.2070828378200531
17,final/metrics/note_with_offsets_velocity_f1_score,0.21574667096138

colorednoise.py (new file, 108 lines)

@@ -0,0 +1,108 @@
"""Generate colored noise."""
from numpy import sqrt, newaxis
from numpy.fft import irfft, rfftfreq
from numpy.random import normal
from numpy import sum as npsum


def powerlaw_psd_gaussian(exponent, size, fmin=0):
    """Gaussian (1/f)**beta noise.

    Based on the algorithm in:
    Timmer, J. and Koenig, M.:
    On generating power law noise.
    Astron. Astrophys. 300, 707-710 (1995)

    Normalised to unit variance.

    Parameters
    ----------
    exponent : float
        The power spectrum of the generated noise is proportional to
        S(f) = (1 / f)**beta
        flicker / pink noise: exponent beta = 1
        brown noise: exponent beta = 2

        Furthermore, the autocorrelation decays proportional to lag**-gamma
        with gamma = 1 - beta for 0 < beta < 1.
        There may be finite-size issues for beta close to one.
    size : int or iterable
        The output has the given shape, and the desired power spectrum in
        the last coordinate. That is, the last dimension is taken as time,
        and all other components are independent.
    fmin : float, optional
        Low-frequency cutoff.
        Default: 0 corresponds to the original paper. It is not actually
        zero, but 1/samples.

    Returns
    -------
    out : array
        The samples.

    Examples
    --------
    # generate 1/f noise == pink noise == flicker noise
    >>> import colorednoise as cn
    >>> y = cn.powerlaw_psd_gaussian(1, 5)
    """
    # Make sure size is a list so we can iterate it and assign to it.
    try:
        size = list(size)
    except TypeError:
        size = [size]

    # The number of samples in each time series
    samples = size[-1]

    # Calculate frequencies (we assume a sample rate of one)
    # Use fft functions for real output (-> hermitian spectrum)
    f = rfftfreq(samples)

    # Build scaling factors for all frequencies
    s_scale = f
    fmin = max(fmin, 1. / samples)  # Low frequency cutoff
    ix = npsum(s_scale < fmin)      # Index of the cutoff
    if ix and ix < len(s_scale):
        s_scale[:ix] = s_scale[ix]
    s_scale = s_scale**(-exponent / 2.)

    # Calculate theoretical output standard deviation from scaling
    w = s_scale[1:].copy()
    w[-1] *= (1 + (samples % 2)) / 2.  # correct f = +-0.5
    sigma = 2 * sqrt(npsum(w**2)) / samples

    # Adjust size to generate one Fourier component per frequency
    size[-1] = len(f)

    # Add empty dimension(s) to broadcast s_scale along the last
    # dimension of the generated random power + phase (below)
    dims_to_add = len(size) - 1
    s_scale = s_scale[(newaxis,) * dims_to_add + (Ellipsis,)]

    # Generate scaled random power + phase
    sr = normal(scale=s_scale, size=size)
    si = normal(scale=s_scale, size=size)

    # If the signal length is even, frequencies +/- 0.5 are equal,
    # so the coefficient must be real.
    if not (samples % 2):
        si[..., -1] = 0

    # Regardless of signal length, the DC component must be real
    si[..., 0] = 0

    # Combine power + corrected phase to Fourier components
    s = sr + 1J * si

    # Transform to real time series & scale to unit variance
    y = irfft(s, n=samples, axis=-1) / sigma

    return y
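
As a quick sanity check of the generator above, here is a minimal, illustrative sketch (assuming only numpy and the colorednoise.py module added in this commit; the RMS-based scaling mirrors what add_noise in transform_wav.py below does, and the example signal is synthetic):

    import numpy as np
    from colorednoise import powerlaw_psd_gaussian

    # Two seconds of pink (1/f) noise at 16 kHz; unit variance by construction.
    pink = powerlaw_psd_gaussian(1, 2 * 16000)

    # Scale the noise to 10% of the signal's mean square, as add_noise does,
    # casting to float first so the squaring cannot overflow int16.
    signal = np.random.randint(-2**14, 2**14, size=2 * 16000).astype(np.int16)
    mean_square = np.mean(signal.astype(np.float64) ** 2)
    noisy = (signal + pink * np.sqrt(mean_square * 0.1)).astype(np.int16)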

degraded.csv (new file, 55 lines)

@@ -0,0 +1,55 @@
,metric,value
0,final/metrics/frame_precision,0.7293761372566223
1,final/metrics/frame_recall,0.8212106227874756
2,final/metrics/frame_f1_score,0.7701795697212219
3,final/metrics/frame_accuracy,0.9836516380310059
4,final/metrics/frame_accuracy_without_true_negatives,0.6388141512870789
5,final/metrics/note_density,7.263808250427246
6,final/metrics/note_precision,0.5048912763595581
7,final/metrics/note_recall,0.45291581749916077
8,final/metrics/note_f1_score,0.4764764904975891
9,final/metrics/note_with_velocity_precision,0.38284000754356384
10,final/metrics/note_with_velocity_recall,0.3431974947452545
11,final/metrics/note_with_velocity_f1_score,0.3611662983894348
12,final/metrics/note_with_offsets_precision,0.3148140609264374
13,final/metrics/note_with_offsets_recall,0.28389376401901245
14,final/metrics/note_with_offsets_f1_score,0.29793334007263184
15,final/metrics/note_with_offsets_velocity_precision,0.24326983094215393
16,final/metrics/note_with_offsets_velocity_recall,0.21928410232067108
17,final/metrics/note_with_offsets_velocity_f1_score,0.23017948865890503
0,final/metrics/frame_precision,0.6845414638519287
1,final/metrics/frame_recall,0.7084149718284607
2,final/metrics/frame_f1_score,0.6921667456626892
3,final/metrics/frame_accuracy,0.9789363741874695
4,final/metrics/frame_accuracy_without_true_negatives,0.5345737338066101
5,final/metrics/note_density,6.421718597412109
6,final/metrics/note_precision,0.49045559763908386
7,final/metrics/note_recall,0.3755749464035034
8,final/metrics/note_f1_score,0.42303866147994995
9,final/metrics/note_with_velocity_precision,0.3649778962135315
10,final/metrics/note_with_velocity_recall,0.27852118015289307
11,final/metrics/note_with_velocity_f1_score,0.31417202949523926
12,final/metrics/note_with_offsets_precision,0.2567719519138336
13,final/metrics/note_with_offsets_recall,0.19949588179588318
14,final/metrics/note_with_offsets_f1_score,0.22342373430728912
15,final/metrics/note_with_offsets_velocity_precision,0.19573232531547546
16,final/metrics/note_with_offsets_velocity_recall,0.15153546631336212
17,final/metrics/note_with_offsets_velocity_f1_score,0.16995804011821747
0,final/metrics/frame_precision,0.730536699295044
1,final/metrics/frame_recall,0.8425199389457703
2,final/metrics/frame_f1_score,0.7802173495292664
3,final/metrics/frame_accuracy,0.9841498732566833
4,final/metrics/frame_accuracy_without_true_negatives,0.652758002281189
5,final/metrics/note_density,7.3658447265625
6,final/metrics/note_precision,0.5048999190330505
7,final/metrics/note_recall,0.4615851938724518
8,final/metrics/note_f1_score,0.48121312260627747
9,final/metrics/note_with_velocity_precision,0.34594792127609253
10,final/metrics/note_with_velocity_recall,0.3155739903450012
11,final/metrics/note_with_velocity_f1_score,0.32934126257896423
12,final/metrics/note_with_offsets_precision,0.32145705819129944
13,final/metrics/note_with_offsets_recall,0.29507842659950256
14,final/metrics/note_with_offsets_f1_score,0.3070283532142639
15,final/metrics/note_with_offsets_velocity_precision,0.22619099915027618
16,final/metrics/note_with_offsets_velocity_recall,0.2070828378200531
17,final/metrics/note_with_offsets_velocity_f1_score,0.21574667096138
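
The clean and degraded metric files can be compared directly; a minimal sketch (assuming pandas is available; column names follow the CSV headers, and degraded.csv holds three evaluation runs that are averaged per metric):

    import pandas as pd

    clean = pd.read_csv("clean.csv", index_col=0)
    degraded = pd.read_csv("degraded.csv", index_col=0)

    # Average the repeated degraded runs per metric, then compare to the clean run.
    degraded_mean = degraded.groupby("metric")["value"].mean()
    clean_values = clean.set_index("metric")["value"]

    # A positive delta means the metric dropped after degradation.
    delta = clean_values - degraded_mean
    print(delta.sort_values(ascending=False))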

test.py (new file, 32 lines)

@@ -0,0 +1,32 @@
from argparse import ArgumentParser
from glob import glob
from os import system, environ, mkdir

from transform_wav import transform_file

parser = ArgumentParser(description="Perform transformations on wav files.")
parser.add_argument('-p', dest='pitch_shift_value', type=float, help="Pitch shift value.")
parser.add_argument('-a', dest='amplitude_multiplier_value', type=float, help="Amplitude multiplier value.")
parser.add_argument('-n', dest='noise_amplitude', type=float, help="Amplitude of noise.")
parser.add_argument('--input', '-i', dest='input_directory', help="Path to the directory with input wav files.")
parser.add_argument('--output', '-o', dest='output_directory', help="Path to the directory for output wav files.")

args = parser.parse_args()

try:
    mkdir(args.output_directory)
except FileExistsError:
    pass  # the output directory already exists

for input_path in glob(f"{args.input_directory}/*.wav"):
    output_path = input_path.replace(args.input_directory, args.output_directory)
    print(f"Transforming {input_path} into {output_path}")

    transform_file(
        input_path, output_path,
        pitch_shift_value=args.pitch_shift_value,
        amplitude_multiplier_value=args.amplitude_multiplier_value,
        noise_amplitude=args.noise_amplitude,
    )

system(f'python /opt/conda/envs/magenta/lib/python3.7/site-packages/magenta/models/onsets_frames_transcription/onsets_frames_transcription_create_tfrecords.py --csv="./dataset/manifest.csv" --output_directory="./dataset/processed" --wav_dir="{args.output_directory}" --midi_dir="./dataset/midi" --expected_splits="test"')
system(f'onsets_frames_transcription_infer --model_dir="{environ["MODEL_DIR"]}" --output_dir="./eval/" --examples_path=./dataset/processed/test.tfrecord* --hparams="use_cudnn=false" --preprocess_examples=True')

transform_wav.py

@@ -3,91 +3,112 @@
Basic command-line tool used to transform .wav files.

@author: Szymon Szczyrbak
@author: Kacper Donat
"""
import wave
import numpy as np
import struct
import resampy

from argparse import ArgumentParser

parser = ArgumentParser(description="Perform transformations on wav files.")
parser.add_argument('--input', '-i', dest='input_path', help="Input wav file path.")
parser.add_argument('--output', '-o', dest='output_path', help="Output wav file path.")
parser.add_argument('-p', dest='pitch_shift_value', type=int, help="Pitch shift value.")
parser.add_argument('-a', dest='amplitude_multiplier_value', type=float, help="Amplitude multiplier value.")

from noise import pnoise1
from colorednoise import powerlaw_psd_gaussian


# https://stackoverflow.com/questions/43963982/python-change-pitch-of-wav-file
def shift_pitch(input_path, output_path, pitch_shift_value):
    # Read input file.
    wr = wave.open(input_path, 'r')
    # Set the parameters for the output file.
    par = list(wr.getparams())
    par[3] = 0  # The number of samples will be set by writeframes.
    par = tuple(par)
    ww = wave.open(output_path, 'w')
    ww.setparams(par)

    fr = 20  # TODO: Not sure what this does... A higher number reduces reverb?
    sz = wr.getframerate() // fr  # Number of samples processed in one loop iteration.
    c = int(wr.getnframes() / sz)  # Number of chunks in the file.
    shift = pitch_shift_value // fr
    for num in range(c):
        # Read a chunk and split it into left and right channels.
        da = np.fromstring(wr.readframes(sz), dtype=np.int16)
        left, right = da[0::2], da[1::2]
        # Extract the frequencies using FFT.
        lf, rf = np.fft.rfft(left), np.fft.rfft(right)
        # Increase the pitch by rolling the arrays.
        lf, rf = np.roll(lf, shift), np.roll(rf, shift)
        # The highest frequencies get rolled to the start of the array. Zero 'em.
        if pitch_shift_value > 0:
            lf[0:shift], rf[0:shift] = 0, 0
        else:
            lf[shift-1:-1], rf[shift-1:-1] = 0, 0  # TODO: Not sure if it's alright for a negative shift.
        # Inverse FFT.
        nl, nr = np.fft.irfft(lf), np.fft.irfft(rf)
        # Combine left and right channels.
        ns = np.column_stack((nl, nr)).ravel().astype(np.int16)
        # Write to the output file.
        ww.writeframes(ns.tostring())
    wr.close()
    ww.close()


def shift_pitch(da, wr, pitch_shift_value):
    multiplier = 1.0 + ((pitch_shift_value - 1.0) * (2**(1/12) - 1.0)) * 0.1

    left, right = da[0::2], da[1::2]
    # Extract the frequencies using FFT.
    lf, rf = np.fft.rfft(left), np.fft.rfft(right)

    size = len(lf)
    new_size = int(size * multiplier)
    lf, rf = resampy.resample(lf, size, new_size), resampy.resample(rf, size, new_size)

    # Inverse FFT.
    nl, nr = np.fft.irfft(lf), np.fft.irfft(rf)

    # Combine left and right channels.
    return np.column_stack((nl, nr)).ravel().astype(np.int16)


# https://stackoverflow.com/questions/13329617/change-the-volume-of-a-wav-file-in-python
def multiply_amplitude(input_path, output_path, amplitude_multiplier_value):
def multiply_amplitude(samples, wr, i, amplitude_multiplier_value):
    amplitude_multiplier_value *= 2

    noise = np.vectorize(lambda x: (1.0 - amplitude_multiplier_value / 2) + pnoise1((x + i) * 0.05 / wr.getframerate(), octaves=5) * amplitude_multiplier_value)
    modulate = np.fromfunction(noise, samples.shape, dtype=float)

    return (samples * modulate).astype(np.int16)  # multiply amplitude


def add_noise(samples, wr, amplitude=0.1):
    noise = powerlaw_psd_gaussian(1, samples.shape)

    RMS = np.mean(samples ** 2)
    An = np.sqrt(RMS * amplitude)

    return (samples + noise * An).astype(np.int16)


def transform_file(input_path, output_path, pitch_shift_value=None, amplitude_multiplier_value=None, noise_amplitude=None):
    # Read input file.
    wr = wave.open(input_path, 'r')
    samples_num = wr.getparams()[3]

    # Set the parameters for the output file.
    par = list(wr.getparams())
    par[3] = 0  # The number of samples will be set by writeframes.
    par = tuple(par)

    # Open output file.
    ww = wave.open(output_path, 'w')
    ww.setparams(par)

    da = np.fromstring(wr.readframes(samples_num), np.int16) * amplitude_multiplier_value  # multiply amplitude
    da = da.astype(np.int16)
    ns = struct.pack('h' * len(da), *da)
    ww.writeframes(ns)

    fr = 5
    per_loop = wr.getframerate() // fr
    count = wr.getnframes() // per_loop

    for i in range(count):
        ns = np.frombuffer(wr.readframes(per_loop), dtype=np.int16)

        if isinstance(pitch_shift_value, float) and pitch_shift_value - 1.0 > 0.001:
            ns = shift_pitch(ns, wr, pitch_shift_value)

        if isinstance(amplitude_multiplier_value, float) and amplitude_multiplier_value > 0.01:
            ns = multiply_amplitude(ns, wr, i * per_loop, amplitude_multiplier_value)

        if isinstance(noise_amplitude, float) and noise_amplitude > 0.0:
            ns = add_noise(ns, wr, noise_amplitude)

        ww.writeframes(ns)

    wr.close()
    ww.close()


parser = ArgumentParser(description="Perform transformations on wav files.")
parser.add_argument('--input', '-i', dest='input_path', help="Input wav file path.")
parser.add_argument('--output', '-o', dest='output_path', help="Output wav file path.")
parser.add_argument('-p', dest='pitch_shift_value', type=float, help="Pitch shift value.")
parser.add_argument('-a', dest='amplitude_multiplier_value', type=float, help="Amplitude multiplier value.")
parser.add_argument('-n', dest='noise_amplitude', type=float, help="Amplitude of noise.")


def main():
    args = parser.parse_args()

    input_path = args.input_path
    output_path = args.output_path
    pitch_shift_value = args.pitch_shift_value
    amplitude_multiplier_value = args.amplitude_multiplier_value
    if isinstance(pitch_shift_value, int):
        shift_pitch(input_path, output_path, pitch_shift_value)
    elif isinstance(amplitude_multiplier_value, float):
        # TODO: Weird noises for amplitude x5.
        multiply_amplitude(input_path, output_path, amplitude_multiplier_value)
    # TODO: More transformations. Should only one transformation be performed in one run?
    output_path = args.output_path

    transform_file(
        input_path, output_path,
        pitch_shift_value=args.pitch_shift_value,
        amplitude_multiplier_value=args.amplitude_multiplier_value,
        noise_amplitude=args.noise_amplitude
    )


if __name__ == "__main__":
    main()
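
For completeness, an illustrative direct call of the new transform_file API (the file paths are hypothetical; the parameter meanings follow the functions above: the pitch value feeds the resample-based shift, the amplitude value sets the depth of the Perlin-noise modulation, and the noise value scales pink noise relative to the signal's mean square):

    from transform_wav import transform_file

    transform_file(
        "dataset/wav/example.wav", "dataset/degraded/example.wav",  # hypothetical paths
        pitch_shift_value=1.5,
        amplitude_multiplier_value=0.5,
        noise_amplitude=0.1,
    )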