+import math
+
 import torch
 from torch import nn
 import torch.nn.functional as F

 from .rnn import GenericRNN
+from .distributions import CensoredMixturePointyBoi
+
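+# SineEmbedding featurizes a scalar (here: inter-event time in seconds) as n
+# sinusoids with geometrically spaced frequencies f0 * interval**k, k = 0..n-1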
+class SineEmbedding(nn.Module):
+    def __init__(self, n, f0=1e-3, interval=2):
+        super().__init__()
+        self.n = n
+        self.register_buffer('fs', f0 * interval**torch.arange(n) * 2 * math.pi)
+
+    def forward(self, x):
+        x = x[..., None] * self.fs
+        return x.sin()
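+# e.g. SineEmbedding(16)(torch.rand(8, 100)) has shape (8, 100, 16)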

-class PitchPredictor(nn.Module):
+class NotePredictor(nn.Module):
     # note: use named arguments only for benefit of training script
-    def __init__(self, emb_size=128, hidden_size=512, domain_size=128,
-            num_layers=1, kind='gru', dropout=0):
+    def __init__(self,
+            pitch_emb_size=128, time_emb_size=16, hidden_size=512,
+            num_layers=1, kind='gru', dropout=0,
+            num_pitches=128,
+            time_components=5, time_res=1e-2,
+            ):
         """
         """
         super().__init__()

-        self.start_token = domain_size - 2
-        self.end_token = domain_size - 1
+        self.start_token = num_pitches
+        self.end_token = num_pitches + 1
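+        # assumption: the two tokens past the pitch range (e.g. the 128 MIDI
+        # pitches) mark sequence start and end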

-        self.emb = nn.Embedding(domain_size, emb_size)
-        self.proj = nn.Linear(hidden_size, domain_size)
-        #### DEBUG
-        with torch.no_grad():
-            self.proj.weight.mul_(1e-2)
+        self.pitch_domain = num_pitches + 2
+
+        # TODO: upper truncation?
+        self.time_dist = CensoredMixturePointyBoi(time_components, time_res, 0, 10)
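+        # assumption: positional args are (n mixture components, resolution,
+        # lower bound, upper bound), i.e. times censored to [0, 10] seconds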

-        self.rnn = GenericRNN(kind, emb_size, hidden_size,
+        # embeddings for inputs
+        self.pitch_emb = nn.Embedding(self.pitch_domain, pitch_emb_size)
+        self.time_emb = SineEmbedding(time_emb_size)
+
+        # RNN backbone
+        self.rnn = GenericRNN(kind, pitch_emb_size + time_emb_size, hidden_size,
             num_layers=num_layers, batch_first=True, dropout=dropout)
-
-        # learnable initial state
+
+        # learnable initial RNN state
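+        # (randn scaled by hidden_size**-0.5, presumably so the initial state
+        # matches the scale of typical RNN hidden activations)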
         self.initial_state = nn.ParameterList([
             # layer x batch x hidden
             nn.Parameter(torch.randn(num_layers, 1, hidden_size) * hidden_size**-0.5)
             for _ in range(2 if kind == 'lstm' else 1)
         ])

-        # persistent state for inference
+        # projection from RNN state to distribution parameters
+        self.time_proj = nn.Linear(hidden_size, self.time_dist.n_params, bias=False)
+        self.pitch_proj = nn.Linear(hidden_size + time_emb_size, self.pitch_domain)
+        with torch.no_grad():
+            self.time_proj.weight.mul_(1e-2)
+            self.pitch_proj.weight.mul_(1e-2)
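+        # shrinking the projection weights presumably keeps the initial time
+        # and pitch distributions close to uninformative early in training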
+
+        # persistent RNN state for inference
         for n, t in zip(self.cell_state_names(), self.initial_state):
             self.register_buffer(n, t.clone())

@@ -42,36 +70,61 @@ def cell_state_names(self):
     def cell_state(self):
         return tuple(getattr(self, n) for n in self.cell_state_names())

-    def forward(self, notes):
+    def forward(self, pitches, times):
         """
         Args:
-            notes: LongTensor[batch, time]
+            pitches: LongTensor[batch, time]
+            times: FloatTensor[batch, time]
         """
-        x = self.emb(notes) # batch, time, emb_size
+
+        time_emb = self.time_emb(times) # batch, time, time_emb_size
+        pitch_emb = self.pitch_emb(pitches) # batch, time, pitch_emb_size
+
+        x = torch.cat((pitch_emb, time_emb), -1)
         ## broadcast initial state to batch size
         initial_state = tuple(
             t.expand(self.rnn.num_layers, x.shape[0], -1).contiguous() # layers x batch x hidden
             for t in self.initial_state)
         h, _ = self.rnn(x, initial_state) # batch, time, hidden_size

-        logits = self.proj(h[:,:-1]) # batch, time-1, 128
-        logits = F.log_softmax(logits, -1) # logits = logits - logits.logsumexp(-1, keepdim=True)
-        targets = notes[:,1:,None] # batch, time-1, 1
-        return {
-            'log_probs': logits.gather(-1, targets)[...,0],
-            'logits': logits
+        # RNN hidden state -> time prediction
+        time_params = self.time_proj(h[:,:-1]) # batch, time-1, time_params
+        time_targets = times[:,1:] # batch, time-1
+        time_result = self.time_dist(time_params, time_targets)
+        time_log_probs = time_result.pop('log_prob')
+
+        # RNN hidden state, time -> pitch prediction
+        # pitch_params = h[...,:self.pitch_domain] + self.pitch_bias # CI
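+        # conditioning the pitch head on the embedded target time factors the
+        # joint as p(time, pitch | h) = p(time | h) * p(pitch | h, time)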
+        pitch_params = self.pitch_proj(torch.cat((h[:,:-1], time_emb[:,1:]), -1))
+        pitch_logits = F.log_softmax(pitch_params, -1)
+        pitch_targets = pitches[:,1:,None] # batch, time-1, 1
+        pitch_log_probs = pitch_logits.gather(-1, pitch_targets)[...,0]
+
+        r = {
+            'pitch_log_probs': pitch_log_probs,
+            'time_log_probs': time_log_probs,
+            **time_result
         }
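+        # diagnostic: probability mass the time model places within +/-30ms of
+        # the target, with the lower edge clamped to 0 (times are nonnegative)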
+        with torch.no_grad():
+            r['time_acc_30ms'] = (
+                self.time_dist.cdf(time_params, time_targets + 0.03)
+                - torch.where(time_targets - 0.03 >= 0,
+                    self.time_dist.cdf(time_params, time_targets - 0.03),
+                    time_targets.new_zeros([]))
+            )
+        return r

-    def predict(self, note, sample=True):
+    # TODO: time
+    def predict(self, note, time, sample=True):
         """
         Args:
             note: int
             sample: bool
         Returns:
-            int if `sample` else Tensor[domain_size]
+            int if `sample` else Tensor[num_pitches+2]
         """
         note = torch.LongTensor([[note]]) # 1x1 (batch, time)
-        x = self.emb(note) # 1, 1, emb_size
+        x = self.pitch_emb(note) # 1, 1, pitch_emb_size

         h, new_state = self.rnn(x, self.cell_state)
         for t, new_t in zip(self.cell_state, new_state):