Add test stuff
This commit is contained in:
parent
72d7060fd8
commit
82ad7968e1
8
.gitignore
vendored
8
.gitignore
vendored
@ -139,11 +139,9 @@ cython_debug/
|
||||
|
||||
# Generated docs
|
||||
*.pdf
|
||||
|
||||
/dataset/wav/
|
||||
!/dataset/wav/.gitkeep
|
||||
/dataset/processed/
|
||||
!/dataset/processed/.gitkeep
|
||||
/dataset/*
|
||||
!/dataset/midi/
|
||||
!/dataset/*/.gitkeep
|
||||
/dataset/manifest.csv
|
||||
/eval/
|
||||
metrics.csv
|
3
.vscode/settings.json
vendored
Normal file
3
.vscode/settings.json
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
{
|
||||
"python.pythonPath": "/opt/conda/envs/magenta/bin/python"
|
||||
}
|
6
Makefile
6
Makefile
@ -12,4 +12,8 @@ dataset: manifest
|
||||
|
||||
test: dataset
|
||||
onsets_frames_transcription_infer --model_dir="${MODEL_DIR}" --output_dir="./eval/" --examples_path=./dataset/processed/test.tfrecord* --hparams="use_cudnn=false" --preprocess_examples=True
|
||||
|
||||
|
||||
.PHONY: clean

# Remove the files written to ./eval/ by the `test` target.
# -f keeps the target from failing when eval/ is already empty
# (the unexpanded glob would otherwise be passed to rm as a literal name).
clean:
	rm -f eval/*
|
19
clean.csv
Normal file
19
clean.csv
Normal file
@ -0,0 +1,19 @@
|
||||
,metric,value
|
||||
0,final/metrics/frame_precision,0.730536699295044
|
||||
1,final/metrics/frame_recall,0.8425199389457703
|
||||
2,final/metrics/frame_f1_score,0.7802173495292664
|
||||
3,final/metrics/frame_accuracy,0.9841498732566833
|
||||
4,final/metrics/frame_accuracy_without_true_negatives,0.652758002281189
|
||||
5,final/metrics/note_density,7.3658447265625
|
||||
6,final/metrics/note_precision,0.5048999190330505
|
||||
7,final/metrics/note_recall,0.4615851938724518
|
||||
8,final/metrics/note_f1_score,0.48121312260627747
|
||||
9,final/metrics/note_with_velocity_precision,0.34594792127609253
|
||||
10,final/metrics/note_with_velocity_recall,0.3155739903450012
|
||||
11,final/metrics/note_with_velocity_f1_score,0.32934126257896423
|
||||
12,final/metrics/note_with_offsets_precision,0.32145705819129944
|
||||
13,final/metrics/note_with_offsets_recall,0.29507842659950256
|
||||
14,final/metrics/note_with_offsets_f1_score,0.3070283532142639
|
||||
15,final/metrics/note_with_offsets_velocity_precision,0.22619099915027618
|
||||
16,final/metrics/note_with_offsets_velocity_recall,0.2070828378200531
|
||||
17,final/metrics/note_with_offsets_velocity_f1_score,0.21574667096138
|
|
108
colorednoise.py
Normal file
108
colorednoise.py
Normal file
@ -0,0 +1,108 @@
|
||||
"""Generate colored noise."""
|
||||
|
||||
from numpy import sqrt, newaxis
|
||||
from numpy.fft import irfft, rfftfreq
|
||||
from numpy.random import normal
|
||||
from numpy import sum as npsum
|
||||
|
||||
|
||||
def powerlaw_psd_gaussian(exponent, size, fmin=0, random_state=None):
    """Generate Gaussian (1/f)**beta noise.

    Based on the algorithm in:
    Timmer, J. and Koenig, M.:
    On generating power law noise.
    Astron. Astrophys. 300, 707-710 (1995)

    The result is divided by the theoretical standard deviation of the
    shaped spectrum, i.e. it is normalised to unit variance.

    Parameters
    ----------
    exponent : float
        The power-spectrum of the generated noise is proportional to

        S(f) = (1 / f)**beta
        flicker / pink noise:   exponent beta = 1
        brown noise:            exponent beta = 2

        Furthermore, the autocorrelation decays proportional to lag**-gamma
        with gamma = 1 - beta for 0 < beta < 1.
        There may be finite-size issues for beta close to one.

    size : int or iterable of int
        Shape of the output. The last dimension is taken as time and gets
        the desired power spectrum; all other components are independent.

    fmin : float, optional
        Low-frequency cutoff. Values below 1/samples are raised to
        1/samples, so the default of 0 means "no extra cutoff".

    random_state : None, int or object with a ``normal`` method, optional
        Source of randomness. ``None`` (default) uses the global NumPy
        generator exactly as before. An int seeds a private
        ``numpy.random.RandomState``. Any object exposing
        ``normal(scale=..., size=...)`` (e.g. a ``RandomState``) is used
        directly.

    Returns
    -------
    out : array
        The samples.

    Examples
    --------
    # generate 1/f noise == pink noise == flicker noise
    >>> import colorednoise as cn
    >>> y = cn.powerlaw_psd_gaussian(1, 5)
    """
    # Make sure size is a list so we can iterate it and assign to it.
    try:
        size = list(size)
    except TypeError:
        size = [size]

    # The number of samples in each time series
    samples = size[-1]

    # Calculate frequencies (we assume a sample rate of one).
    # Use fft functions for real output (-> hermitian spectrum).
    f = rfftfreq(samples)

    # Build scaling factors for all frequencies. Work on a copy so the
    # frequency array itself is never modified through an alias.
    s_scale = f.copy()
    fmin = max(fmin, 1. / samples)  # Low frequency cutoff
    ix = npsum(s_scale < fmin)      # Index of the cutoff
    if ix and ix < len(s_scale):
        # Flatten the spectrum below the cutoff (this also removes f == 0).
        s_scale[:ix] = s_scale[ix]
    s_scale = s_scale**(-exponent / 2.)

    # Calculate theoretical output standard deviation from scaling
    w = s_scale[1:].copy()
    w[-1] *= (1 + (samples % 2)) / 2.  # correct f = +-0.5
    sigma = 2 * sqrt(npsum(w**2)) / samples

    # Adjust size to generate one Fourier component per frequency
    size[-1] = len(f)

    # Add empty dimension(s) to broadcast s_scale along the last
    # dimension of the generated random power + phase (below)
    dims_to_add = len(size) - 1
    s_scale = s_scale[(newaxis,) * dims_to_add + (Ellipsis,)]

    # Pick the Gaussian sampler.
    if random_state is None:
        draw = normal
    elif hasattr(random_state, "normal"):
        draw = random_state.normal
    else:
        from numpy.random import RandomState
        draw = RandomState(random_state).normal

    # Generate scaled random power + phase
    sr = draw(scale=s_scale, size=size)
    si = draw(scale=s_scale, size=size)

    # If the signal length is even, frequencies +/- 0.5 are equal
    # so the coefficient must be real.
    if not (samples % 2):
        si[..., -1] = 0

    # Regardless of signal length, the DC component must be real
    si[..., 0] = 0

    # Combine power + corrected phase to Fourier components
    s = sr + 1J * si

    # Transform to real time series & scale to unit variance
    y = irfft(s, n=samples, axis=-1) / sigma

    return y
|
55
degraded.csv
Normal file
55
degraded.csv
Normal file
@ -0,0 +1,55 @@
|
||||
,metric,value
|
||||
0,final/metrics/frame_precision,0.7293761372566223
|
||||
1,final/metrics/frame_recall,0.8212106227874756
|
||||
2,final/metrics/frame_f1_score,0.7701795697212219
|
||||
3,final/metrics/frame_accuracy,0.9836516380310059
|
||||
4,final/metrics/frame_accuracy_without_true_negatives,0.6388141512870789
|
||||
5,final/metrics/note_density,7.263808250427246
|
||||
6,final/metrics/note_precision,0.5048912763595581
|
||||
7,final/metrics/note_recall,0.45291581749916077
|
||||
8,final/metrics/note_f1_score,0.4764764904975891
|
||||
9,final/metrics/note_with_velocity_precision,0.38284000754356384
|
||||
10,final/metrics/note_with_velocity_recall,0.3431974947452545
|
||||
11,final/metrics/note_with_velocity_f1_score,0.3611662983894348
|
||||
12,final/metrics/note_with_offsets_precision,0.3148140609264374
|
||||
13,final/metrics/note_with_offsets_recall,0.28389376401901245
|
||||
14,final/metrics/note_with_offsets_f1_score,0.29793334007263184
|
||||
15,final/metrics/note_with_offsets_velocity_precision,0.24326983094215393
|
||||
16,final/metrics/note_with_offsets_velocity_recall,0.21928410232067108
|
||||
17,final/metrics/note_with_offsets_velocity_f1_score,0.23017948865890503
|
||||
0,final/metrics/frame_precision,0.6845414638519287
|
||||
1,final/metrics/frame_recall,0.7084149718284607
|
||||
2,final/metrics/frame_f1_score,0.6921667456626892
|
||||
3,final/metrics/frame_accuracy,0.9789363741874695
|
||||
4,final/metrics/frame_accuracy_without_true_negatives,0.5345737338066101
|
||||
5,final/metrics/note_density,6.421718597412109
|
||||
6,final/metrics/note_precision,0.49045559763908386
|
||||
7,final/metrics/note_recall,0.3755749464035034
|
||||
8,final/metrics/note_f1_score,0.42303866147994995
|
||||
9,final/metrics/note_with_velocity_precision,0.3649778962135315
|
||||
10,final/metrics/note_with_velocity_recall,0.27852118015289307
|
||||
11,final/metrics/note_with_velocity_f1_score,0.31417202949523926
|
||||
12,final/metrics/note_with_offsets_precision,0.2567719519138336
|
||||
13,final/metrics/note_with_offsets_recall,0.19949588179588318
|
||||
14,final/metrics/note_with_offsets_f1_score,0.22342373430728912
|
||||
15,final/metrics/note_with_offsets_velocity_precision,0.19573232531547546
|
||||
16,final/metrics/note_with_offsets_velocity_recall,0.15153546631336212
|
||||
17,final/metrics/note_with_offsets_velocity_f1_score,0.16995804011821747
|
||||
0,final/metrics/frame_precision,0.730536699295044
|
||||
1,final/metrics/frame_recall,0.8425199389457703
|
||||
2,final/metrics/frame_f1_score,0.7802173495292664
|
||||
3,final/metrics/frame_accuracy,0.9841498732566833
|
||||
4,final/metrics/frame_accuracy_without_true_negatives,0.652758002281189
|
||||
5,final/metrics/note_density,7.3658447265625
|
||||
6,final/metrics/note_precision,0.5048999190330505
|
||||
7,final/metrics/note_recall,0.4615851938724518
|
||||
8,final/metrics/note_f1_score,0.48121312260627747
|
||||
9,final/metrics/note_with_velocity_precision,0.34594792127609253
|
||||
10,final/metrics/note_with_velocity_recall,0.3155739903450012
|
||||
11,final/metrics/note_with_velocity_f1_score,0.32934126257896423
|
||||
12,final/metrics/note_with_offsets_precision,0.32145705819129944
|
||||
13,final/metrics/note_with_offsets_recall,0.29507842659950256
|
||||
14,final/metrics/note_with_offsets_f1_score,0.3070283532142639
|
||||
15,final/metrics/note_with_offsets_velocity_precision,0.22619099915027618
|
||||
16,final/metrics/note_with_offsets_velocity_recall,0.2070828378200531
|
||||
17,final/metrics/note_with_offsets_velocity_f1_score,0.21574667096138
|
|
32
test.py
Normal file
32
test.py
Normal file
@ -0,0 +1,32 @@
|
||||
"""Degrade a directory of wav files, rebuild the test TFRecords and run inference."""

import subprocess
from argparse import ArgumentParser
from glob import glob
from os import environ, makedirs
from os.path import basename, join

from transform_wav import transform_file

# TFRecord builder shipped with magenta inside the conda environment.
CREATE_TFRECORDS_SCRIPT = (
    "/opt/conda/envs/magenta/lib/python3.7/site-packages/magenta/models/"
    "onsets_frames_transcription/onsets_frames_transcription_create_tfrecords.py"
)

parser = ArgumentParser(description="Preform transformation on wav files.")
parser.add_argument('-p', dest='pitch_shift_value', type=float, help="Pitch shift value.")
parser.add_argument('-a', dest='amplitude_multiplier_value', type=float, help="Amplitude multiplier value.")
parser.add_argument('-n', dest='noise_amplitude', type=float, help="Amplitude of noise.")
parser.add_argument('--input', '-i', dest='input_directory', required=True, help="Path to directory with input waves path.")
parser.add_argument('--output', '-o', dest='output_directory', required=True, help="Path to directory to output waves path.")


def main():
    """Transform every ``*.wav`` in the input directory, then evaluate the model.

    Steps:
      1. write a transformed copy of each input wav into the output directory,
      2. rebuild ``./dataset/processed`` TFRecords from the transformed wavs,
      3. run ``onsets_frames_transcription_infer`` and write to ``./eval/``.

    The model directory is read from the ``MODEL_DIR`` environment variable.
    """
    args = parser.parse_args()

    # Fail before doing any work instead of raising KeyError at the very end.
    model_dir = environ.get("MODEL_DIR")
    if not model_dir:
        parser.error("the MODEL_DIR environment variable must be set")

    # Only "already exists" is tolerated; real errors (permissions, ...) surface.
    makedirs(args.output_directory, exist_ok=True)

    for input_path in glob(join(args.input_directory, "*.wav")):
        # Build the target from the file name. A textual replace of the
        # directory string could also rewrite other parts of the path
        # (e.g. input directory "." matches the dot in ".wav").
        output_path = join(args.output_directory, basename(input_path))

        print(f"Transforming {input_path} into {output_path}")
        transform_file(
            input_path, output_path,
            pitch_shift_value=args.pitch_shift_value,
            amplitude_multiplier_value=args.amplitude_multiplier_value,
            noise_amplitude=args.noise_amplitude,
        )

    # Argument lists (no shell) so directory names are never interpreted by
    # a shell; check=True stops the pipeline when a step fails.
    subprocess.run(
        [
            "python", CREATE_TFRECORDS_SCRIPT,
            "--csv=./dataset/manifest.csv",
            "--output_directory=./dataset/processed",
            f"--wav_dir={args.output_directory}",
            "--midi_dir=./dataset/midi",
            "--expected_splits=test",
        ],
        check=True,
    )
    subprocess.run(
        [
            "onsets_frames_transcription_infer",
            f"--model_dir={model_dir}",
            "--output_dir=./eval/",
            # The wildcard is passed through literally, as it was before
            # (the shell never expanded it because of the "--examples_path=" prefix).
            "--examples_path=./dataset/processed/test.tfrecord*",
            "--hparams=use_cudnn=false",
            "--preprocess_examples=True",
        ],
        check=True,
    )


if __name__ == "__main__":
    main()
|
135
transform_wav.py
135
transform_wav.py
@ -3,91 +3,112 @@
|
||||
Basic command-line tool used to transform .wav files.
|
||||
|
||||
@author: Szymon Szczyrbak
|
||||
@author: Kacper Donat
|
||||
"""
|
||||
|
||||
import wave
|
||||
import numpy as np
|
||||
import struct
|
||||
import resampy
|
||||
from argparse import ArgumentParser
|
||||
|
||||
parser = ArgumentParser(description="Preform transformation on wav files.")
|
||||
parser.add_argument('--input', '-i', dest='input_path', help="Input wav file path.")
|
||||
parser.add_argument('--output', '-o', dest='output_path', help="Output wav file path.")
|
||||
parser.add_argument('-p', dest='pitch_shift_value', type=int, help="Pitch shift value.")
|
||||
parser.add_argument('-a', dest='amplitude_multiplier_value', type=float, help="Amplitude multiplier value.")
|
||||
from noise import pnoise1
|
||||
from colorednoise import powerlaw_psd_gaussian
|
||||
|
||||
# https://stackoverflow.com/questions/43963982/python-change-pitch-of-wav-file
|
||||
def shift_pitch(input_path, output_path, pitch_shift_value):
    """Copy a wav file while shifting its spectrum by a fixed number of Hz.

    The file is processed in chunks of 1/20 s. Each chunk is split into two
    interleaved channels, transformed with a real FFT, rolled by
    ``pitch_shift_value // 20`` bins and transformed back. Bins that wrap
    around the end of the spectrum are zeroed.

    Assumes 16-bit stereo input (samples are de-interleaved as left/right)
    -- TODO confirm for other layouts.

    Parameters
    ----------
    input_path : str
        Path of the wav file to read.
    output_path : str
        Path of the wav file to write (same parameters as the input).
    pitch_shift_value : int
        Requested shift; positive moves content up, negative down. Note that
        floor division is used, so e.g. -5 yields a one-bin downward shift
        while +5 yields no shift.
    """
    chunks_per_second = 20
    sample_dtype = np.dtype('<i2')  # wav PCM data is little-endian

    # Context managers close both files even if processing raises.
    with wave.open(input_path, 'rb') as wr, wave.open(output_path, 'wb') as ww:
        # The number of frames is filled in by writeframes.
        ww.setparams(wr.getparams()._replace(nframes=0))

        # Number of frames processed in one loop iteration.
        frames_per_chunk = max(1, wr.getframerate() // chunks_per_second)
        shift = int(pitch_shift_value // chunks_per_second)

        # Read until the file is exhausted so the trailing partial chunk is
        # kept and the output has as many frames as the input.
        while True:
            raw = wr.readframes(frames_per_chunk)
            if not raw:
                break

            da = np.frombuffer(raw, dtype=sample_dtype)

            shifted = []
            for channel in (da[0::2], da[1::2]):
                # Move the spectrum by rolling the FFT bins.
                spectrum = np.roll(np.fft.rfft(channel), shift)
                # Zero the bins that wrapped around.
                if shift > 0:
                    spectrum[:shift] = 0
                elif shift < 0:
                    spectrum[shift:] = 0
                # n keeps the chunk length for odd-sized chunks too.
                shifted.append(np.fft.irfft(spectrum, n=len(channel)))

            # Re-interleave, round and saturate instead of wrapping around.
            ns = np.clip(np.rint(np.column_stack(shifted)), -32768, 32767)
            ww.writeframes(ns.astype(sample_dtype).ravel().tobytes())
|
||||
def shift_pitch(da, wr, pitch_shift_value):
    """Shift the pitch of one chunk of interleaved two-channel samples.

    The spectrum of each channel is stretched by
    ``1 + (pitch_shift_value - 1) * (2**(1/12) - 1) * 0.1`` using
    ``resampy.resample`` and transformed back to a chunk of the same length
    as the input.

    Parameters
    ----------
    da : numpy.ndarray
        Interleaved samples (even indices are taken as the first channel,
        odd indices as the second).
    wr : wave.Wave_read
        Source file handle; unused here, kept for a uniform signature with
        the other chunk transforms.
    pitch_shift_value : float
        Shift strength; 1.0 leaves the spectrum unscaled.

    Returns
    -------
    numpy.ndarray
        Interleaved int16 samples.
    """
    multiplier = 1.0 + ((pitch_shift_value - 1.0) * (2**(1/12) - 1.0)) * 0.1

    left, right = da[0::2], da[1::2]

    # Extract the frequencies using FFT.
    lf, rf = np.fft.rfft(left), np.fft.rfft(right)

    size = len(lf)
    new_size = int(size * multiplier)

    # Stretch the spectrum along the frequency axis.
    lf, rf = resampy.resample(lf, size, new_size), resampy.resample(rf, size, new_size)

    # Inverse FFT back to the ORIGINAL chunk length. Without n, irfft
    # returns 2 * (new_size - 1) samples: the chunk would get longer (and
    # quieter, because irfft normalises by the output length) instead of
    # having its pitch moved, and the file would drift out of sync.
    nl = np.fft.irfft(lf, n=len(left))
    nr = np.fft.irfft(rf, n=len(right))

    # Combine left and right channel; round and saturate rather than let
    # the int16 cast wrap around.
    mixed = np.column_stack((nl, nr)).ravel()
    return np.clip(np.rint(mixed), -32768, 32767).astype(np.int16)
|
||||
|
||||
# https://stackoverflow.com/questions/13329617/change-the-volume-of-a-wav-file-in-python
|
||||
def multiply_amplitude(samples, wr, i, amplitude_multiplier_value):
    """Modulate the amplitude of a chunk with slowly varying Perlin noise.

    Every sample ``x`` of the chunk is multiplied by

        (1 - a) + pnoise1((x + i) * 0.05 / framerate, octaves=5) * 2 * a

    where ``a`` is ``amplitude_multiplier_value``.

    Parameters
    ----------
    samples : numpy.ndarray
        One-dimensional array of samples.
    wr : wave.Wave_read
        Source file handle; only its frame rate is read.
    i : int
        Offset of this chunk, added to the sample index so consecutive
        chunks continue the same noise curve.
    amplitude_multiplier_value : float
        Modulation strength ``a``.

    Returns
    -------
    numpy.ndarray
        Modulated int16 samples, saturated to the int16 range.
    """
    depth = amplitude_multiplier_value * 2
    base = 1.0 - depth / 2
    framerate = wr.getframerate()

    # A plain list comprehension instead of np.vectorize: same values, but
    # it also works for an empty chunk (np.vectorize raises on size-0 input).
    wobble = np.array(
        [pnoise1((x + i) * 0.05 / framerate, octaves=5) for x in range(samples.shape[0])],
        dtype=float,
    )
    modulate = base + wobble * depth

    # The gain can exceed 1, so saturate before casting; a bare astype
    # would wrap loud samples around to the opposite sign.
    return np.clip(samples * modulate, -32768, 32767).astype(np.int16)
|
||||
|
||||
|
||||
def add_noise(samples, wr, amplitude=0.1):
    """Add pink (1/f) noise scaled relative to the signal power.

    Parameters
    ----------
    samples : numpy.ndarray
        Samples of the chunk (int16 when called from ``transform_file``).
    wr : wave.Wave_read
        Source file handle; unused here, kept for a uniform signature with
        the other chunk transforms.
    amplitude : float, optional
        Ratio of noise power to signal power; the unit-variance noise is
        multiplied by ``sqrt(mean(samples**2) * amplitude)``.

    Returns
    -------
    numpy.ndarray
        Noisy int16 samples, saturated to the int16 range.
    """
    noise = powerlaw_psd_gaussian(1, samples.shape)

    # Square in float64: squaring an int16 array stays int16 and silently
    # overflows, which makes the power estimate wrong or even negative.
    signal = samples.astype(np.float64)
    mean_square = np.mean(signal ** 2)
    An = np.sqrt(mean_square * amplitude)

    # Saturate instead of wrapping around when signal + noise leaves int16.
    return np.clip(signal + noise * An, -32768, 32767).astype(np.int16)
|
||||
|
||||
def transform_file(input_path, output_path, pitch_shift_value=None, amplitude_multiplier_value=None, noise_amplitude=None):
    """Copy a wav file, applying the requested degradations chunk by chunk.

    The input is read in chunks of 1/5 s. Each chunk is passed through the
    enabled transforms, in this order: pitch shift, amplitude modulation,
    additive noise. With no transform enabled the output is a plain copy.

    Parameters
    ----------
    input_path : str
        Path of the wav file to read.
    output_path : str
        Path of the wav file to write (same parameters as the input).
    pitch_shift_value : float, optional
        Applied only when it is a float greater than 1.001.
    amplitude_multiplier_value : float, optional
        Applied only when it is a float greater than 0.01.
    noise_amplitude : float, optional
        Applied only when it is a float greater than 0.
    """
    fr = 5  # chunks per second

    # Context managers close both files even if a transform raises.
    with wave.open(input_path, 'rb') as wr, wave.open(output_path, 'wb') as ww:
        # The number of frames is filled in by writeframes.
        ww.setparams(wr.getparams()._replace(nframes=0))

        per_loop = max(1, wr.getframerate() // fr)
        offset = 0  # passed to multiply_amplitude; grows by per_loop per chunk

        # Read until the file is exhausted. Iterating nframes // per_loop
        # times would drop the final partial chunk (and the whole file when
        # it is shorter than one chunk).
        while True:
            raw = wr.readframes(per_loop)
            if not raw:
                break

            ns = np.frombuffer(raw, dtype=np.int16)

            if isinstance(pitch_shift_value, float) and pitch_shift_value - 1.0 > 0.001:
                ns = shift_pitch(ns, wr, pitch_shift_value)

            if isinstance(amplitude_multiplier_value, float) and amplitude_multiplier_value > 0.01:
                ns = multiply_amplitude(ns, wr, offset, amplitude_multiplier_value)

            if isinstance(noise_amplitude, float) and noise_amplitude > 0.0:
                ns = add_noise(ns, wr, noise_amplitude)

            ww.writeframes(ns.tobytes())
            offset += per_loop
|
||||
|
||||
|
||||
parser = ArgumentParser(description="Preform transformation on wav files.")

parser.add_argument('--input', '-i', dest='input_path', help="Input wav file path.")
parser.add_argument('--output', '-o', dest='output_path', help="Output wav file path.")

parser.add_argument('-p', dest='pitch_shift_value', type=float, help="Pitch shift value.")
parser.add_argument('-a', dest='amplitude_multiplier_value', type=float, help="Amplitude multiplier value.")
parser.add_argument('-n', dest='noise_amplitude', type=float, help="Amplitude of noise.")


def main():
    """Command-line entry point: transform one wav file.

    Parses the arguments defined on the module-level ``parser`` and hands
    them to ``transform_file``. All transforms are optional and may be
    combined in a single run.
    """
    args = parser.parse_args()

    # The chunk-level shift_pitch / multiply_amplitude helpers no longer take
    # file paths, so everything goes through transform_file.
    transform_file(
        args.input_path, args.output_path,
        pitch_shift_value=args.pitch_shift_value,
        amplitude_multiplier_value=args.amplitude_multiplier_value,
        noise_amplitude=args.noise_amplitude,
    )


if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user