サウンドプログラミング1

科目名：　メディアネットワーク実験IA（2018年～）
対象：　メディアネットワークコース3年目
日時：　6月30日（火） - 7月1日（水）13:00～18:00
場所：　M棟1階計算機室
レポート提出締切：　7月7日（火）13:00
レポート提出先：　メールにファイルを添付し、aoki@ime.ist.hokudai.ac.jpまで提出すること．
連絡先：　青木直史（情報エレクトロニクス棟6階6-07）（Tel: 011-706-6532）（E-mail: aoki@ime.ist.hokudai.ac.jp）

目的

　音はマルチメディアコンテンツを構成する重要な要素である．本実験は，Pythonによるプログラミングを通して，サウンド処理に対する理解を深めることを目的としている．

1．はじめに

　本実験は，Jupyter Notebookを利用し，下記のプログラムをブラウザを使って実行しながら進めるものとする．なお，音を確認する場合は各自のイヤフォンまたはヘッドフォンを使うこと．環境のインストールはつぎの手順のとおり．

(1) Anacondaのインストール

https://www.anaconda.com/distribution/
から「Python 3.7」のインストーラをダウンロードして実行．

(2) ライブラリのインストール

Anaconda Promptを開く．

pip install librosa
と入力して実行．

pip install --upgrade tensorflow==2.0.0
と入力して実行．

pip install keras
と入力して実行．

(3) Jupyter Notebookの起動

jupyter notebook
と入力して実行．

(4) ブラウザ（Chrome推奨）を利用してプログラミングを行う．

「New」ボタンをクリックして「Python3」を選択．

音ファイルや画像ファイルを利用するときは「Upload」ボタンをクリックし，必要なファイルをJupyter Notebookにアップロードすること．

2．モノラルの音データの入出力

　つぎのプログラムを実行し，モノラルの音データの入出力について確かめなさい．サイン波のパラメータが変化すると，音はどのように変化するか，波形，周波数特性，スペクトログラムを観察し，考察しなさい．

(1)

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
from IPython.display import display, Audio

(2)

fs = 8000
length_of_s = int(fs * 1)

s = np.zeros(length_of_s)
for n in range(length_of_s):
    s[n] = 0.5 * np.sin(2 * np.pi * 1000 * n / fs)

plt.figure()
plt.plot(s)
plt.axis([0, 50, -1, 1])
plt.savefig('01.png')

(3)

wavfile.write('ex01.wav', fs, (s * 32768).astype(np.int16))

(4)

Audio('ex01.wav')

(5)

fs, s = wavfile.read('ex01.wav')

s = s.astype(np.float) / 32768;

plt.figure()
plt.plot(s)
plt.axis([0, 50, -1, 1])
plt.savefig('02.png')

(6)

def Hanning_window(N):
    w = np.zeros(N)
    if N % 2 == 0:
        for n in range(N):
            w[n] = 0.5 - 0.5 * np.cos(2 * np.pi * n / N)
    else:
        for n in range(N):
            w[n] = 0.5 - 0.5 * np.cos(2 * np.pi * (n + 0.5) / N)
    return w

(7)

N = 1024

w = Hanning_window(N)
S = np.fft.fft(s[0 : N] * w)

f = np.zeros(N)
for k in range(N):
    f[k] = fs * k / N

plt.figure()
plt.plot(f[0 : int(N / 2)], np.abs(S[0 : int(N / 2)]))
plt.xlim([0, int(fs / 2)])
plt.xlabel("frequency [Hz]")
plt.ylabel("amplitude")
plt.savefig('03.png')

(8)

N = 512
shift_size = 64
length_of_s = len(s)
number_of_frame = int((length_of_s - N) / shift_size)

x = np.zeros(N)
w = Hanning_window(N)
S = np.zeros((int(N / 2 + 1), number_of_frame))

for frame in range(number_of_frame):
    offset = shift_size * frame
    for n in range(N):
        x[n] = s[offset + n] * w[n]

    X = np.fft.fft(x)
    X_abs = np.abs(X)

    for k in range(int(N / 2 + 1)):
        S[k, frame] = 20 * np.log10(X_abs[k] + 1);

plt.figure()
xmin = (N / 2) / fs
xmax = (shift_size * (number_of_frame - 1) + N / 2)/ fs
ymin = 0
ymax = fs / 2
plt.imshow(S, aspect = 'auto', cmap = 'Greys', origin='lower', vmin = 0, vmax = 20, extent = [xmin, xmax, ymin, ymax])
plt.xlim([0, length_of_s / fs])
plt.xlabel("time [s]")
plt.ylabel("frequency [Hz]")
plt.savefig('04.png')

3．ステレオの音データの入出力

　つぎのプログラムを実行し，ステレオの音データの入出力について確かめなさい．音像が左右に移動して聞こえるのはなぜか，考察しなさい．

(1)

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
from IPython.display import display, Audio

(2)

fs = 8000
length_of_s = int(fs * 4)

sL = np.zeros(length_of_s)
sR = np.zeros(length_of_s)
for n in range(length_of_s):
    sL[n] = 0.5 * np.sin(2 * np.pi * 1000 * n / fs) * (0.5 + np.cos(2 * np.pi * 0.25 * n / fs) * 0.5)
    sR[n] = 0.5 * np.sin(2 * np.pi * 1000 * n / fs) * (0.5 - np.cos(2 * np.pi * 0.25 * n / fs) * 0.5)

plt.figure()
plt.plot(sL)
plt.axis([0, int(fs * 4), -1, 1])
plt.savefig('01.png')

plt.figure()
plt.plot(sR)
plt.axis([0, int(fs * 4), -1, 1])
plt.savefig('02.png')

(3)

s = np.zeros((length_of_s, 2))

s[:, 0] = sL
s[:, 1] = sR

wavfile.write('ex02.wav', fs, (s * 32768).astype(np.int16))

(4)

Audio('ex02.wav')

(5)

fs, s = wavfile.read('ex02.wav')

s = s.astype(np.float) / 32768;

sL = s[:, 0]
sR = s[:, 1]

plt.figure()
plt.plot(sL)
plt.axis([0, int(fs * 4), -1, 1])
plt.savefig('03.png')

plt.figure()
plt.plot(sR)
plt.axis([0, int(fs * 4), -1, 1])
plt.savefig('04.png')

(6)

def Hanning_window(N):
    w = np.zeros(N)
    if N % 2 == 0:
        for n in range(N):
            w[n] = 0.5 - 0.5 * np.cos(2 * np.pi * n / N)
    else:
        for n in range(N):
            w[n] = 0.5 - 0.5 * np.cos(2 * np.pi * (n + 0.5) / N)
    return w

(7)

N = 1024

w = Hanning_window(N)
SL = np.fft.fft(sL[0 : N] * w)
SR = np.fft.fft(sR[0 : N] * w)

f = np.zeros(N)
for k in range(N):
    f[k] = fs * k / N

plt.figure()
plt.plot(f[0 : int(N / 2)], np.abs(SL[0 : int(N / 2)]))
plt.xlim([0, int(fs / 2)])
plt.xlabel("frequency [Hz]")
plt.ylabel("amplitude")
plt.savefig('05.png')

plt.figure()
plt.plot(f[0 : int(N / 2)], np.abs(SR[0 : int(N / 2)]))
plt.xlim([0, int(fs / 2)])
plt.xlabel("frequency [Hz]")
plt.ylabel("amplitude")
plt.savefig('06.png')

(8)

N = 512
shift_size = 64
length_of_s = len(s)
number_of_frame = int((length_of_s - N) / shift_size)

x = np.zeros(N)
w = Hanning_window(N)
SL = np.zeros((int(N / 2 + 1), number_of_frame))
SR = np.zeros((int(N / 2 + 1), number_of_frame))

for frame in range(number_of_frame):
    offset = shift_size * frame
    for n in range(N):
        x[n] = sL[offset + n] * w[n]

    X = np.fft.fft(x)
    X_abs = np.abs(X)

    for k in range(int(N / 2 + 1)):
        SL[k, frame] = 20 * np.log10(X_abs[k] + 1);

for frame in range(number_of_frame):
    offset = shift_size * frame
    for n in range(N):
        x[n] = sR[offset + n] * w[n]

    X = np.fft.fft(x)
    X_abs = np.abs(X)

    for k in range(int(N / 2 + 1)):
        SR[k, frame] = 20 * np.log10(X_abs[k] + 1);

plt.figure()
xmin = (N / 2) / fs
xmax = (shift_size * (number_of_frame - 1) + N / 2)/ fs
ymin = 0
ymax = fs / 2
plt.imshow(SL, aspect = 'auto', cmap = 'Greys', origin='lower', vmin = 0, vmax = 20, extent = [xmin, xmax, ymin, ymax])
plt.xlim([0, length_of_s / fs])
plt.xlabel("time [s]")
plt.ylabel("frequency [Hz]")
plt.savefig('07.png')

plt.figure()
xmin = (N / 2) / fs
xmax = (shift_size * (number_of_frame - 1) + N / 2)/ fs
ymin = 0
ymax = fs / 2
plt.imshow(SR, aspect = 'auto', cmap = 'Greys', origin='lower', vmin = 0, vmax = 20, extent = [xmin, xmax, ymin, ymax])
plt.xlim([0, length_of_s / fs])
plt.xlabel("time [s]")
plt.ylabel("frequency [Hz]")
plt.savefig('08.png')

4．音響合成

　つぎのプログラムを実行し，楽器音の特徴を観察しなさい．また，こうした音の特徴を再現することで楽器音をつくりなさい．

(1)

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
from IPython.display import display, Audio

(2)

Audio('marimba.wav')

(3)

fs, s = wavfile.read('marimba.wav')

s = s.astype(np.float) / 32768;

plt.figure()
plt.plot(s)
#plt.axis([0, 50, -1, 1])
plt.savefig('01.png')

(4)

def Hanning_window(N):
    w = np.zeros(N)
    if N % 2 == 0:
        for n in range(N):
            w[n] = 0.5 - 0.5 * np.cos(2 * np.pi * n / N)
    else:
        for n in range(N):
            w[n] = 0.5 - 0.5 * np.cos(2 * np.pi * (n + 0.5) / N)
    return w

(5)

N = 1024

w = Hanning_window(N)
S = np.fft.fft(s[0 : N] * w)

f = np.zeros(N)
for k in range(N):
    f[k] = fs * k / N

plt.figure()
plt.plot(f[0 : int(N / 2)], np.abs(S[0 : int(N / 2)]))
plt.xlim([0, int(fs / 2)])
plt.xlabel("frequency [Hz]")
plt.ylabel("amplitude")
plt.savefig('02.png')

(6)

N = 512
shift_size = 64
length_of_s = len(s)
number_of_frame = int((length_of_s - N) / shift_size)

x = np.zeros(N)
w = Hanning_window(N)
S = np.zeros((int(N / 2 + 1), number_of_frame))

for frame in range(number_of_frame):
    offset = shift_size * frame
    for n in range(N):
        x[n] = s[offset + n] * w[n]

    X = np.fft.fft(x)
    X_abs = np.abs(X)

    for k in range(int(N / 2 + 1)):
        S[k, frame] = 20 * np.log10(X_abs[k] + 1);

plt.figure()
xmin = (N / 2) / fs
xmax = (shift_size * (number_of_frame - 1) + N / 2)/ fs
ymin = 0
ymax = fs / 2
plt.imshow(S, aspect = 'auto', cmap = 'Greys', origin='lower', vmin = 0, vmax = 20, extent = [xmin, xmax, ymin, ymax])
plt.xlim([0, length_of_s / fs])
plt.xlabel("time [s]")
plt.ylabel("frequency [Hz]")
plt.savefig('03.png')

(7)

def ADSR(fs, A, D, S, R, gate, duration):
    A = int(fs * A)
    D = int(fs * D)
    R = int(fs * R)
    gate = int(fs * gate)
    duration = int(fs * duration)
    e = np.zeros(duration)
    if A != 0:
        for n in range(A):
            e[n] = 1.0 - np.exp(-5.0 * n / A)

    if D != 0:
        for n in range(A, gate):
            e[n] = S + (1.0 - S) * np.exp(-5.0 * (n - A) / D)

    else:
        for n in range(A, gate):
            e[n] = S

    if R != 0:
        for n in range(gate, duration):
            e[n]= e[gate - 1] * np.exp(-5.0 * (n - gate + 1) / R)

    return e

(8)

fs = 44100

A = 0.1
D = 0.4
S = 0.5
R = 0.4
gate = 1
duration = 2
offset = 0
depth = 1
vca = ADSR(fs, A, D, S, R, gate, duration)
for n in range(int(fs * duration)):
    vca[n] = offset + vca[n] * depth

plt.figure()
plt.plot(vca)

(9)

note_number = 72
f0 = 440 * np.power(2, (note_number - 69) / 12)

(10)

length_of_s = int(fs * 2)

s = np.zeros(length_of_s)
p = np.zeros(length_of_s)

for n in range(length_of_s):
    p[n] = np.sin(2 * np.pi * f0 * n / fs)
    
vca_A = 0.05
vca_D = 0.8
vca_S = 0
vca_R = 0.8
vca_gate = 0.8
vca_duration = 2
vca_offset = 0
vca_depth = 1
vca = ADSR(fs, vca_A, vca_D, vca_S, vca_R, vca_gate, vca_duration)
for n in range(length_of_s):
    vca[n] = vca_offset + vca[n] * vca_depth
for n in range(length_of_s):
    p[n] *= vca[n] * 0.3
for n in range(length_of_s):
    s[n] += p[n]

for n in range(length_of_s):
    p[n] = np.sin(2 * np.pi * f0 * 4 * n / fs)
    
vca_A = 0.05
vca_D = 0.2
vca_S = 0
vca_R = 0.2
vca_gate = 0.2
vca_duration = 2
vca_offset = 0
vca_depth = 1
vca = ADSR(fs, vca_A, vca_D, vca_S, vca_R, vca_gate, vca_duration)
for n in range(length_of_s):
    vca[n] = vca_offset + vca[n] * vca_depth
for n in range(length_of_s):
    p[n] *= vca[n] * 1.0
for n in range(length_of_s):
    s[n] += p[n]

for n in range(length_of_s):
    p[n] = np.sin(2 * np.pi * f0 * 10 * n / fs)
    
vca_A = 0.05
vca_D = 0.1
vca_S = 0
vca_R = 0.1
vca_gate = 0.1
vca_duration = 2
vca_offset = 0
vca_depth = 1
vca = ADSR(fs, vca_A, vca_D, vca_S, vca_R, vca_gate, vca_duration)
for n in range(length_of_s):
    vca[n] = vca_offset + vca[n] * vca_depth
for n in range(length_of_s):
    p[n] *= vca[n] * 0.05
for n in range(length_of_s):
    s[n] += p[n]

s = s / max(abs(s)) * 0.9
    
plt.figure()
plt.plot(s)
#plt.axis([0, 50, -1, 1])
plt.savefig('04.png')

(11)

wavfile.write('ex03.wav', fs, (s * 32768).astype(np.int16))

(12)

Audio('ex03.wav')

(13)

N = 1024

w = Hanning_window(N)
S = np.fft.fft(s[0 : N] * w)

f = np.zeros(N)
for k in range(N):
    f[k] = fs * k / N

plt.figure()
plt.plot(f[0 : int(N / 2)], np.abs(S[0 : int(N / 2)]))
plt.xlim([0, int(fs / 2)])
plt.xlabel("frequency [Hz]")
plt.ylabel("amplitude")
plt.savefig('05.png')

(14)

N = 512
shift_size = 64
length_of_s = len(s)
number_of_frame = int((length_of_s - N) / shift_size)

x = np.zeros(N)
w = Hanning_window(N)
S = np.zeros((int(N / 2 + 1), number_of_frame))

for frame in range(number_of_frame):
    offset = shift_size * frame
    for n in range(N):
        x[n] = s[offset + n] * w[n]

    X = np.fft.fft(x)
    X_abs = np.abs(X)

    for k in range(int(N / 2 + 1)):
        S[k, frame] = 20 * np.log10(X_abs[k] + 1);

plt.figure()
xmin = (N / 2) / fs
xmax = (shift_size * (number_of_frame - 1) + N / 2)/ fs
ymin = 0
ymax = fs / 2
plt.imshow(S, aspect = 'auto', cmap = 'Greys', origin='lower', vmin = 0, vmax = 20, extent = [xmin, xmax, ymin, ymax])
plt.xlim([0, length_of_s / fs])
plt.xlabel("time [s]")
plt.ylabel("frequency [Hz]")
plt.savefig('06.png')

5．音源分離

　ステレオの音データに対して，つぎのプログラムを実行すると，ボーカルの歌声を取り除き，伴奏の音源をつくることができる．こうしたボーカルキャンセラの原理について考察しなさい．

(1)

%matplotlib inline
import math
import wave
import numpy as np
import matplotlib.pyplot as plt
from scipy import signal
from scipy import fftpack
from scipy.io import wavfile
from IPython.display import display, Audio

(2)

Audio('stereo01.wav')

(3)

fs, s0 = wavfile.read('stereo01.wav')

s0 = s0.astype(np.float) / 32768;

length_of_s = len(s0)

s0L = s0[:, 0]
s0R = s0[:, 1]

plt.figure()
plt.plot(s0L)
plt.axis([0, length_of_s, -1, 1])
plt.savefig('01.png')

plt.figure()
plt.plot(s0R)
plt.axis([0, length_of_s, -1, 1])
plt.savefig('02.png')

(4)

def Hanning_window(N):
    w = np.zeros(N)
    if N % 2 == 0:
        for n in range(N):
            w[n] = 0.5 - 0.5 * math.cos(2 * math.pi * n / N)
    else:
        for n in range(N):
            w[n] = 0.5 - 0.5 * math.cos(2 * math.pi * (n + 0.5) / N)
    return w

(5)

s1L = np.zeros(length_of_s)
s1R = np.zeros(length_of_s)

frame_size = 4096
shift_size = int(frame_size / 2)
number_of_frame = int((length_of_s - frame_size) / shift_size)

xL = np.zeros(frame_size)
xR = np.zeros(frame_size)
w = Hanning_window(frame_size)

dft_size = frame_size
fmin = round(100 * dft_size / fs) # 100 Hz
fmax = round(12000 * dft_size / fs) # 12 kHz

for frame in range(number_of_frame):
    
    offset = shift_size * frame
    
    for n in range(frame_size):
        xL[n] = s0L[offset + n] * w[n]
        xR[n] = s0R[offset + n] * w[n]
    
    XL = np.fft.fft(xL)
    XL_abs = np.abs(XL)
    XL_angle = np.angle(XL)
    
    XR = np.fft.fft(xR)
    XR_abs = np.abs(XR)
    XR_angle = np.angle(XR)
    
    for k in range(fmin, fmax):
        if XL_abs[k] + XR_abs[k] != 0:
            d = (XL_abs[k] - XR_abs[k]) / (XL_abs[k] + XR_abs[k])
            if d * d < 0.01: # if d * d > 0.01
                XL_abs[k] = 0.00001
                XL_abs[dft_size - k] = XL_abs[k]
                XR_abs[k] = 0.00001
                XR_abs[dft_size - k] = XR_abs[k]
    
    YL = XL_abs * np.exp(1j * XL_angle)
    yL = np.fft.ifft(YL)
    yL = np.real(yL)
    
    YR = XR_abs * np.exp(1j * XR_angle)
    yR = np.fft.ifft(YR)
    yR = np.real(yR)
    
    for n in range(frame_size):
        s1L[offset + n] += yL[n]
        s1R[offset + n] += yR[n]

(6)

plt.figure()
plt.plot(s1L)
plt.axis([0, length_of_s, -1, 1])
plt.savefig('03.png')

plt.figure()
plt.plot(s1R)
plt.axis([0, length_of_s, -1, 1])
plt.savefig('04.png')

(6)

s1 = np.zeros((length_of_s, 2))

s1[:, 0] = s1L
s1[:, 1] = s1R

wavfile.write('ex04.wav', fs, (s1 * 32768).astype(np.int16))

(8)

Audio('ex04.wav')

6．テンポ推定

　音楽の要素のひとつにリズムがある．つぎのプログラムを実行し，BPM（Beat Per Minute）を単位として，楽曲のテンポを推定しなさい．

(1)

%matplotlib inline
import math
import wave
import numpy as np
import matplotlib.pyplot as plt
from scipy import signal
from scipy import fftpack
from scipy.io import wavfile
from IPython.display import display, Audio

(2)

Audio('153bpm.wav')

(3)

fs, s = wavfile.read('153bpm.wav')

s = s.astype(np.float) / 32768;

length_of_s = len(s)

sL = s[:, 0]
sR = s[:, 1]

plt.figure()
plt.plot(sL)
plt.axis([0, length_of_s, -1, 1])
plt.savefig('01.png')

plt.figure()
plt.plot(sR)
plt.axis([0, length_of_s, -1, 1])
plt.savefig('02.png')

(4)

s = (sL + sR) / 2;

plt.figure()
plt.plot(s)
plt.axis([0, length_of_s, -1, 1])
plt.savefig('03.png')

(5)

frame_size = 441
number_of_frame = int(length_of_s / frame_size)

loudness = np.zeros(number_of_frame)
delta_loudness = np.zeros(number_of_frame)

for frame in range(number_of_frame):
    offset = frame_size * frame
    for n in range(frame_size):
        loudness[frame] += s[offset + n] * s[offset + n]

delta_loudness[0] = 0;
for frame in range(1, number_of_frame):
    delta_loudness[frame] = loudness[frame] - loudness[frame - 1]
    if delta_loudness[frame] < 0:
        delta_loudness[frame] = 0

plt.figure()
plt.plot(loudness)
#plt.axis([0, length_of_s, 0, 16])
plt.savefig('04.png')

plt.figure()
plt.plot(delta_loudness)
#plt.axis([0, length_of_s, 0, 16])
plt.savefig('05.png')

(6)

bmin = 60;
bmax = 240;

r = np.zeros(bmax)

for b in range(bmin, bmax):
    u = 0
    v = 0
    for frame in range(1, number_of_frame):
        u += delta_loudness[frame] * math.sin(2 * math.pi * (b / 60) * (frame_size * frame / fs))
        v += delta_loudness[frame] * math.cos(2 * math.pi * (b / 60) * (frame_size * frame / fs))
        
    r[b] = math.sqrt(u * u + v * v)
    
plt.figure()
plt.plot(r)
#plt.axis([0, bmax, 0, 16])
plt.savefig('06.png')

(7)

rmax = 0
tempo = bmin

for b in range(bmin, bmax):
    if r[b] > rmax:
        rmax = r[b]
        tempo = b
        
print(tempo)

7．コード推定

　音楽の要素のひとつにハーモニーがある．つぎのプログラムを実行し，楽曲のコードを推定しなさい．なお，このプログラムは，2.5秒の時間間隔で楽曲のコードを推定するものになっている．

(1)

%matplotlib inline
import math
import wave
import numpy as np
import matplotlib.pyplot as plt
from scipy import signal
from scipy import fftpack
from scipy.io import wavfile
from IPython.display import display, Audio

(2)

import librosa
import librosa.display

(3)

Audio('stereo02.wav')

(4)

fs, s = wavfile.read('stereo02.wav')

s = s.astype(np.float) / 32768;

length_of_s = len(s)

sL = s[:, 0]
sR = s[:, 1]

plt.figure()
plt.plot(sL)
plt.axis([0, length_of_s, -1, 1])
plt.savefig('01.png')

plt.figure()
plt.plot(sR)
plt.axis([0, length_of_s, -1, 1])
plt.savefig('02.png')

(5)

s = (sL + sR) / 2;

plt.figure()
plt.plot(s)
plt.axis([0, length_of_s, -1, 1])
plt.savefig('03.png')

(6)

h, p = librosa.effects.hpss(s)

(7)

plt.figure()
plt.plot(h)
plt.savefig('04.png')

plt.figure()
plt.plot(p)
plt.savefig('05.png')

(8)

wavfile.write('harmonic.wav', fs, (h * 32768).astype(np.int16))
wavfile.write('percussive.wav', fs, (p * 32768).astype(np.int16))

(9)

Audio('harmonic.wav')

(10)

Audio('percussive.wav')

(11)

chroma = librosa.feature.chroma_cens(y = h, sr = fs, hop_length = 512, n_octaves= 7, n_chroma= 12)

(12)

plt.figure(figsize = (12, 4)) 
librosa.display.specshow(chroma, sr = fs, x_axis = 'time', y_axis = 'chroma', vmin = 0, vmax = 1) 
plt.title('Chromagram') 
plt.colorbar() 
plt.tight_layout()
plt.savefig('06.png')

(13)

from collections import OrderedDict
chord = OrderedDict() 
chord['C'] = [1 / 3, 0, 0, 0, 1 / 3, 0, 0, 1 / 3, 0, 0, 0, 0] 
chord['Db'] = [0, 1 / 3, 0, 0, 0, 1 / 3, 0, 0, 1 / 3, 0, 0, 0] 
chord['D'] = [0, 0, 1 / 3, 0, 0, 0, 1 / 3, 0, 0, 1 / 3, 0, 0] 
chord['Eb'] = [0, 0, 0, 1 / 3, 0, 0, 0, 1 / 3, 0, 0, 1 / 3, 0] 
chord['E'] = [0, 0, 0, 0, 1 / 3, 0, 0, 0, 1 / 3, 0, 0, 1 / 3] 
chord['F'] = [1 / 3, 0, 0, 0, 0, 1 / 3, 0, 0, 0, 1 / 3, 0, 0] 
chord['Gb'] = [0, 1 / 3, 0, 0, 0, 0, 1 / 3, 0, 0, 0, 1 / 3, 0] 
chord['G'] = [0, 0, 1 / 3, 0, 0, 0, 0, 1 / 3, 0, 0, 0, 1 / 3] 
chord['Ab'] = [1 / 3, 0, 0, 1 / 3, 0, 0, 0, 0, 1 / 3, 0, 0, 0] 
chord['A'] = [0, 1 / 3, 0, 0, 1 / 3, 0, 0, 0, 0, 1 / 3, 0, 0] 
chord['Bb'] = [0, 0, 1 / 3, 0, 0, 1 / 3, 0, 0, 0, 0, 1 / 3, 0] 
chord['B'] = [0, 0, 0, 1 / 3, 0, 0, 1 / 3, 0, 0, 0, 0, 1 / 3] 
chord['Cm'] = [1 / 3, 0, 0, 1 / 3, 0, 0, 0, 1 / 3, 0, 0, 0, 0] 
chord['Dbm'] = [0, 1 / 3, 0, 0, 1 / 3, 0, 0, 0, 1 / 3, 0, 0, 0] 
chord['Dm'] = [0, 0, 1 / 3, 0, 0, 1 / 3, 0, 0, 0, 1 / 3, 0, 0] 
chord['Ebm'] = [0, 0, 0, 1 / 3, 0, 0, 1 / 3, 0, 0, 0, 1 / 3, 0] 
chord['Em'] = [0, 0, 0, 0, 1 / 3, 0, 0, 1 / 3, 0, 0, 0, 1 / 3] 
chord['Fm'] = [1 / 3, 0, 0, 0, 0, 1 / 3, 0, 0, 1 / 3, 0, 0, 0] 
chord['Gbm'] = [0, 1 / 3, 0, 0, 0, 0, 1 / 3, 0, 0, 1 / 3, 0, 0] 
chord['Gm'] = [0, 0, 1 / 3, 0, 0, 0, 0, 1 / 3, 0, 0, 1 / 3, 0] 
chord['Abm'] = [0, 0, 0, 1 / 3, 0, 0, 0, 0, 1 / 3, 0, 0, 1 / 3] 
chord['Am'] = [1 / 3, 0, 0, 0, 1 / 3, 0, 0, 0, 0, 1 / 3, 0, 0] 
chord['Bbm'] = [0, 1 / 3, 0, 0, 0, 1 / 3, 0, 0, 0, 0, 1 / 3, 0] 
chord['Bm'] = [0, 0, 1 / 3, 0, 0, 0, 1 / 3, 0, 0, 0, 0, 1 / 3]

(14)

frame_size = 512
number_of_frame = chroma.shape[1]
section_size = fs / frame_size * 2.5 # 2.5 second
number_of_section = int(number_of_frame / section_size)

chord_progression = []
sum_of_chroma = np.zeros(12)

for section in range(number_of_section):
    offset = int(section_size * section)
    for frame in range(int(section_size)):
        for tone in range(12):
            sum_of_chroma[tone] += chroma[tone][offset + frame]
            
    max_of_similarity = -1
    for i, (name, template) in enumerate(chord.items()):
        similarity = np.dot(sum_of_chroma, template) / (np.linalg.norm(sum_of_chroma) * np.linalg.norm(template))
        if (similarity > max_of_similarity):
            max_of_similarity = similarity
            esimated_chord = name
            
    chord_progression.append(esimated_chord)
    sum_of_chroma = np.zeros(12)

(15)

print(chord_progression)

8．パターンプレイバック

　つぎのプログラムを実行し，スペクトログラムから音を復元するパターンプレイバックの動作について確かめなさい．パターンプレイバックの原理について，考察しなさい．

(1)

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
from IPython.display import display, Audio

(2)

from PIL import Image
import numpy as np

(3)

S = np.array(Image.open('samples.jpg'))
plt.imshow(S)
plt.gray()

print(S.shape)

(4)

fs = 16000
bits = 16

(5)

N = 160
number_of_frame = 240

f0 = round(fs / N);
T = round(fs / f0);

(6)

A = np.zeros((80, 240))

for frame in range(number_of_frame):
    for k in range(int(N / 2)):
        A[k, frame] = 255 - S[int(N / 2) - k - 1, frame]

plt.imshow(A)
plt.gray()

(7)

length_of_s = int(T * number_of_frame)

s = np.zeros(length_of_s)

(8)

for frame in range(number_of_frame):
    offset = T * frame;
    for k in range(int(N / 2)):
        for n in range(T):
            s[offset + n] = s[offset + n] + A[k, frame] * np.cos(2 * np.pi * k * f0 * n / fs)

(9)

s = s / max(abs(s)) * 0.9

plt.plot(s)

(10)

wavfile.write('samples.wav', fs, (s * 32768).astype(np.int16))

(11)

Audio('samples.wav')

9．レポートについて

　下記の課題に挑戦し，レポートを作成しなさい．課題1は必須とする．課題2～5は選択課題であり，ひとつを選択して挑戦しなさい．課題6～7は発展課題であり，ひとつを選択して挑戦しなさい．

（課題1）（必修）　各自，ウェブサイトなどから楽器音の音データを入手し，実際の音をお手本とすることで，音をつくりなさい．レポートには，どのような特徴に着目して音をつくったか説明すること．なお，プログラムおよび作成した音データはメールで提出しなさい．

（課題2）（選択）　各自，ウェブサイトなどから音楽データを入手し，ボーカルキャンセラのプログラムを実行しなさい．いくつかの音楽データを試し，正しくない結果になる場合を調査することで，あたえられたプログラムにはどのような問題があるか発見し，レポートにまとめなさい．さらに性能のよいボーカルキャンセラのプログラムをつくるにはどうしたらよいか，実際にプログラムをつくって考察しなさい．

（課題3）（選択）　各自，ウェブサイトなどから音楽データを入手し，テンポ推定のプログラムを実行しなさい．いくつかの音楽データを試し，正しくない結果になる場合を調査することで，あたえられたプログラムにはどのような問題があるか発見し，レポートにまとめなさい．さらに性能のよいテンポ推定のプログラムをつくるにはどうしたらよいか，実際にプログラムをつくって考察しなさい．

（課題4）（選択）　各自，ウェブサイトなどから音楽データを入手し，コード推定のプログラムを実行しなさい．いくつかの音楽データを試し，正しくない結果になる場合を調査することで，あたえられたプログラムにはどのような問題があるか発見し，レポートにまとめなさい．さらに性能のよいコード推定のプログラムをつくるにはどうしたらよいか，実際にプログラムをつくって考察しなさい．

（課題5）（選択）　あたえられたパターンプレイバックのプログラムにはどのような問題があるか発見し，レポートにまとめなさい．さらに性能のよいパターンプレイバックのプログラムをつくるにはどうしたらよいか，実際にプログラムをつくって考察しなさい．

（課題6）（発展）　音楽データを，Aメロ，Bメロ，サビなどのセクションに分割するセグメンテーション技術は，音楽データの構造を理解するための第一歩になるとともに，楽曲検索などのアプリケーションを開発するための要素技術にもなっている．こうしたセグメンテーションのしくみについて，実際にプログラムをつくって考察しなさい．（キーワード検索のヒント：self similarity matrix）

（課題7）（発展）　機械学習の飛躍的な進歩によって，近年，音楽情報処理の分野では，さまざまな可能性が広がってきている．機械学習の具体的な手法について調査し，音楽情報処理の分野ではどのような応用が考えられるか，実際にプログラムをつくって考察しなさい．（キーワード検索のヒント：LSTM，CNN）

参考文献

青木直史, ``ゼロからはじめる音響学,'' 講談社, 2014.

青木直史, ``サウンドプログラミング入門 - 音響合成の基本とC言語による実装 - ,'' 技術評論社, 2013.

青木直史, ``C言語ではじめる音のプログラミング - サウンドエフェクトの信号処理 - ,'' オーム社, 2008.

青木直史, ``ディジタル・サウンド処理入門 - 音のプログラミングとMATLAB（Octave・Scilab）における実際 - ,'' CQ出版社, 2006.

Last Modified: June 1 12:00 JST 2020 by Naofumi Aoki
E-mail: aoki@ime.ist.hokudai.ac.jp