본문 바로가기

IT 관련

(임시) 음성 파일 전처리

336x280(권장), 300x250(권장), 250x250, 200x200 크기의 광고 코드만 넣을 수 있습니다.
kaggle_example_test
In [1]:
import os
from os.path import isdir, join
from pathlib import Path
import pandas as pd

# Math
import numpy as np
from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile
import librosa

from sklearn.decomposition import PCA

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import IPython.display as ipd
import librosa.display
from matplotlib.pyplot import imshow

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import pandas as pd
from PIL import Image

%matplotlib inline
In [2]:
def ishow(data):
    """Display a 2-D array as an image after min-max scaling to 0..255.

    The values are shifted so the minimum becomes 0 and stretched so the
    maximum becomes 255, wrapped in a PIL image and drawn with matplotlib's
    ``imshow``. The caller's array is never modified.

    Args:
        data: 2-D array-like of numbers (integer or floating point).
    """
    # Work on a float32 copy so integer input can be scaled and the
    # caller's array is left untouched.
    pixels = np.array(data, dtype=np.float32)
    lowest = pixels.min()
    value_range = pixels.max() - lowest
    pixels -= lowest
    # A constant image has zero range; leave it at 0 instead of dividing
    # by zero.
    if value_range > 0:
        pixels *= 255.0 / value_range
    imshow(Image.fromarray(pixels))
In [3]:
# Root folder of the training audio (hard-coded absolute Windows path).
train_audio_path = 'G:\\datalab\\train'
# Example clip, given relative to the root and including the leading
# separator so it can be appended to the root by plain concatenation.
filename = '\\right\\00b01445_nohash_0.wav'
# wavfile.read returns the sampling rate followed by the sample array.
sample_rate, samples = wavfile.read(str(train_audio_path) + filename)

spectrogram 값과 이미지화

In [4]:
# Spectrogram with scipy's default window settings; only the sampling
# rate is supplied.
frequencies, times, spectrogram = signal.spectrogram(samples, sample_rate)
print(spectrogram)
# Transpose before building the image, then draw it with matplotlib.
im = Image.fromarray(spectrogram.T)
imshow(im)
[[  5.71520207e-03   3.11515573e-02   7.28122715e-04 ...,   6.21214211e-01
    3.10687721e-01   2.14916444e+00]
 [  3.12881768e-02   2.09274516e-02   2.94294208e-02 ...,   4.13140982e-01
    3.66510063e-01   5.49426079e-02]
 [  1.87255666e-01   3.52619253e-02   2.06235528e-01 ...,   6.58459485e-01
    4.37955409e-01   1.30071059e-01]
 ..., 
 [  1.24835360e-05   1.60410273e-05   3.22900923e-05 ...,   7.70110728e-06
    1.92631546e-06   5.55872521e-06]
 [  5.48870275e-06   3.53630571e-06   2.52078479e-07 ...,   6.32678484e-06
    2.36373508e-05   1.43080879e-05]
 [  1.24635122e-07   5.05093658e-06   9.77927084e-06 ...,   3.50720279e-06
    4.36527662e-05   4.44699072e-06]]
Out[4]:
<matplotlib.image.AxesImage at 0x1d349ca9ac8>

log spectrogram 값과 이미지화

In [6]:
def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    """Compute a log-power spectrogram with a Hann window.

    Args:
        audio: Array of samples (assumed 1-D).
        sample_rate: Sampling rate of ``audio`` in Hz.
        window_size: Length of each analysis window in milliseconds.
        step_size: Hop between the starts of consecutive windows in
            milliseconds.
        eps: Small constant added before the logarithm to avoid log(0).

    Returns:
        Tuple ``(freqs, times, log_spec)``. ``log_spec`` is float32 and is
        the transpose of scipy's output, i.e. one row per time frame.

    Raises:
        ValueError: If the hop is not positive or exceeds the window.
    """
    nperseg = int(round(window_size * sample_rate / 1e3))
    hop = int(round(step_size * sample_rate / 1e3))
    if not 0 < hop <= nperseg:
        raise ValueError(
            'step_size must be positive and no larger than window_size')
    # scipy wants the overlap between windows, not the hop. Passing the hop
    # directly is only right when it is exactly half the window.
    freqs, times, spec = signal.spectrogram(audio,
                                            fs=sample_rate,
                                            window='hann',
                                            nperseg=nperseg,
                                            noverlap=nperseg - hop,
                                            detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)
In [7]:
# Log spectrogram of the example clip, shown first as a scaled image.
freqs, times, spectrogram = log_specgram(samples, sample_rate)

ishow(spectrogram)

fig = plt.figure(figsize=(14, 8))
ax1 = fig.add_subplot(211)
# Time axis in seconds: the clip lasts len(samples) / sample_rate seconds.
ax1.plot(np.linspace(0, len(samples) / sample_rate, len(samples)), samples)

ax2 = fig.add_subplot(212)
# Transposed so frequency runs along the vertical axis; extent labels the
# axes with the real time and frequency values.
ax2.imshow(spectrogram.T, aspect='auto', origin='lower',
           extent=[times.min(), times.max(), freqs.min(), freqs.max()])
Out[7]:
<matplotlib.image.AxesImage at 0x1d34b0f5898>
In [15]:
# Same transposed spectrogram, drawn on its own without axis extents.
imshow(spectrogram.T, aspect='auto', origin='lower')
Out[15]:
<matplotlib.image.AxesImage at 0x1d34bde3438>

melspectrogram 값과 이미지화

In [8]:
# librosa takes the waveform through the ``y`` keyword and expects
# floating-point audio; samples read by wavfile.read may be integer PCM,
# so convert before computing the mel spectrogram.
S = librosa.feature.melspectrogram(y=samples.astype(np.float32),
                                   sr=sample_rate, n_mels=128)

ishow(S.T)

# Convert to log scale (dB). We'll use the peak power (max) as reference.
log_S = librosa.power_to_db(S, ref=np.max)

ishow(log_S.T)

plt.figure(figsize=(12, 4))
librosa.display.specshow(log_S, sr=sample_rate, x_axis='time', y_axis='mel')
plt.title('Mel power spectrogram ')
plt.colorbar(format='%+02.0f dB')
plt.tight_layout()

MFCC 값 이미지화

In [9]:
# Derive 20 MFCCs from the log-scaled mel spectrogram in log_S.
mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=20)

# Only the second-order delta of the MFCCs is computed and plotted here.
# NOTE(review): the plot title below says 'MFCC' although the data shown
# is the second-order delta — confirm which was intended.
delta2_mfcc = librosa.feature.delta(mfcc, order=2)
print(delta2_mfcc.shape)
plt.figure(figsize=(12, 4))
librosa.display.specshow(delta2_mfcc)
plt.ylabel('MFCC coeffs')
plt.xlabel('Time')
plt.title('MFCC')
plt.colorbar()
plt.tight_layout()
(20, 29)
In [10]:
# librosa.load returns the waveform followed by its sampling rate.
r, d = librosa.load(str(train_audio_path) + filename)
mfccs = librosa.feature.mfcc(y=r, sr=d, n_mfcc=20)
# Second-order delta of the MFCCs.
mfccs = librosa.feature.delta(mfccs, order=2)
print(mfccs.shape)
# Reuse the shared helper instead of repeating its min-max scaling inline.
ishow(mfccs)
(20, 39)
Out[10]:
<matplotlib.image.AxesImage at 0x1d34bb10320>
In [11]:
# librosa.load returns the waveform followed by its sampling rate.
r, d = librosa.load(str(train_audio_path) + filename)
mfccs = librosa.feature.mfcc(y=r, sr=d, n_mfcc=20)
print(mfccs.shape)
# Reuse the shared helper instead of repeating its min-max scaling inline.
ishow(mfccs)
(20, 39)
Out[11]:
<matplotlib.image.AxesImage at 0x1d34bb32668>
In [13]:
# 3-D surface of the log spectrogram. z has one row per frequency bin and
# one column per time frame, so times go on x and freqs on y. An axis
# `range` takes only a [min, max] pair, so the full coordinate arrays are
# given to the surface itself.
data = [go.Surface(z=spectrogram.T, x=times, y=freqs)]
layout = go.Layout(
    title='Specgtrogram of "right" in 3d',
    scene=dict(
        yaxis=dict(title='Frequencies'),
        xaxis=dict(title='Time'),
        zaxis=dict(title='Log amplitude'),
    ),
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
In [17]:
# Raw waveform above, log spectrogram below, for the same clip.
freqs, times, spectrogram_cut = log_specgram(samples, sample_rate)

fig = plt.figure(figsize=(14, 8))
ax1 = fig.add_subplot(211)
ax1.set_title('Raw wave of ' + filename)
ax1.set_ylabel('Amplitude')
ax1.plot(samples)

ax2 = fig.add_subplot(212)
ax2.set_title('Spectrogram of ' + filename)
# The extent below maps the axes to times (seconds) and freqs (Hz), so the
# labels name those units.
ax2.set_ylabel('Freqs in Hz')
ax2.set_xlabel('Seconds')
ax2.imshow(spectrogram_cut.T, aspect='auto', origin='lower',
           extent=[times.min(), times.max(), freqs.min(), freqs.max()])
Out[17]:
<matplotlib.image.AxesImage at 0x1d34c0f8400>

FFT 필터링

In [18]:
def custom_fft(y, fs):
    """Return the single-sided amplitude spectrum of a signal.

    Args:
        y: 1-D array of samples.
        fs: Sampling rate of ``y`` in Hz.

    Returns:
        Tuple ``(xf, vals)``. ``xf`` holds the frequencies in Hz of the
        first ``N // 2`` FFT bins and ``vals`` holds ``2 / N`` times the
        magnitude of those bins, where ``N`` is ``len(y)``.
    """
    N = y.shape[0]
    half = N // 2
    yf = fft(y)
    # Bin k of an N-point FFT sits at k * fs / N. An evenly spaced axis
    # that ends at fs / 2 would shift every bin slightly.
    xf = np.arange(half) * (fs / N)
    # The FFT of a real signal is symmetric, so the first half suffices.
    # Its values are complex; abs() gives their magnitude.
    vals = 2.0 / N * np.abs(yf[:half])
    return xf, vals
In [19]:
new_sample_rate = 8000

# Reload the clip, then resample it to the target rate.
wav_path = str(train_audio_path) + filename
sample_rate, samples = wavfile.read(wav_path)
# Output length scales with the ratio of the new rate to the old one.
target_length = int(new_sample_rate/sample_rate * samples.shape[0])
resampled = signal.resample(samples, target_length)
In [20]:
# Audio player for the clip as read from disk, at its own sampling rate.
ipd.Audio(samples, rate=sample_rate)
Out[20]:
In [21]:
# Audio player for the resampled clip, at the new sampling rate.
ipd.Audio(resampled, rate=new_sample_rate)
Out[21]:
In [22]:
new_sample_rate = 16000

# Reload the clip, then resample it to the target rate.
wav_path = str(train_audio_path) + filename
sample_rate, samples = wavfile.read(wav_path)
# Output length scales with the ratio of the new rate to the old one.
target_length = int(new_sample_rate/sample_rate * samples.shape[0])
resampled = signal.resample(samples, target_length)
In [23]:
# Audio player for the resampled clip, at the new sampling rate.
ipd.Audio(resampled, rate=new_sample_rate)
Out[23]:
In [24]:
# Amplitude spectrum of the clip as read from disk.
xf, vals = custom_fft(samples, sample_rate)
plt.figure(figsize=(12, 4))
plt.title('FFT of recording sampled with ' + str(sample_rate) + ' Hz')
plt.plot(xf, vals)
plt.xlabel('Frequency')
plt.grid()
plt.show()
In [25]:
# Amplitude spectrum of the resampled clip, for comparison with the
# spectrum of the original samples.
xf, vals = custom_fft(resampled, new_sample_rate)
plt.figure(figsize=(12, 4))
plt.title('FFT of recording sampled with ' + str(new_sample_rate) + ' Hz')
plt.plot(xf, vals)
plt.xlabel('Frequency')
plt.grid()
plt.show()