ultimate_midi_classifier.py

# -*- coding: utf-8 -*-
"""Ultimate_MIDI_Classifier.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/github/asigalov61/Ultimate-MIDI-Classifier/blob/main/Ultimate_MIDI_Classifier.ipynb

# Ultimate MIDI Classifier (ver. 1.0)

***

Powered by tegridy-tools: https://github.com/asigalov61/tegridy-tools

***

WARNING: This complete implementation is a functioning model of the Artificial Intelligence. Please exercise great humility, care, and respect. https://www.nscai.gov/

***

#### Project Los Angeles

#### Tegridy Code 2024

***

# (SETUP ENVIRONMENT)
"""

#@title Install all dependencies
# NOTE: lines starting with "!" are IPython/Colab shell escapes, not Python;
# this cell only runs inside a notebook environment.
# Shallow clone of the project repo: the label files, seed MIDIs and model
# directory used further down are all read from /content/Ultimate-MIDI-Classifier.
!git clone --depth 1 https://github.com/asigalov61/Ultimate-MIDI-Classifier
# einops: presumably required by the bundled x_transformer module -- TODO confirm
!pip install einops
# torch-summary provides the `torchsummary` module imported in the next cell
!pip install torch-summary
# FluidSynth system package: presumably used for MIDI-to-audio rendering -- TODO confirm
!apt install fluidsynth

# Commented out IPython magic to ensure Python compatibility.
# @title Import modules

print('=' * 70)
print('Loading modules...')
print('=' * 70)

import os
import statistics
import re
import tqdm

import torch

# %cd /content/Ultimate-MIDI-Classifier

import TMIDIX

from x_transformer_1_23_2 import *

from midi_to_colab_audio import midi_to_colab_audio

# %cd /content/

import random

from torchsummary import summary
import matplotlib.pyplot as plt
from sklearn import metrics

from huggingface_hub import hf_hub_download

from IPython.display import Audio, display

print('=' * 70)
print('Done')
print('=' * 70)
print('Torch version:', torch.__version__)
print('=' * 70)

"""# (LOAD LABELS AND FUNCTIONS)"""

# @title Load Ultimate MIDI Classifier labels and helper functions

#===============================================================================
# Helper functions
#===============================================================================

def str_strip_song(string):
  """Return a cleaned, title-cased song name.

  '-', '_' and '=' are treated as word separators, every character that is
  not an ASCII letter or a space is dropped, runs of spaces are collapsed
  and the result is stripped and title-cased.

  Returns an empty string when `string` is None.
  """
  if string is None:
    return ''

  spaced = string
  for separator in ('-', '_', '='):
    spaced = spaced.replace(separator, ' ')

  letters_only = re.sub('[^a-zA-Z ]', '', spaced)
  single_spaced = re.sub(' +', ' ', letters_only)

  return single_spaced.strip().title()

def str_strip_artist(string):
  """Return a cleaned, title-cased artist name.

  Works like the song-name cleaner but also keeps digits: '-', '_' and '='
  become spaces, anything that is not an ASCII letter, digit or space is
  removed, runs of spaces are collapsed and the result is stripped and
  title-cased.

  Returns an empty string when `string` is None.
  """
  if string is None:
    return ''

  # One pass that maps each separator character to a space
  spaced = string.translate(str.maketrans('-_=', '   '))

  alnum_only = re.sub('[^0-9a-zA-Z ]', '', spaced)
  words = re.sub(' +', ' ', alnum_only).strip()

  return words.title()

def song_artist_to_song_artist_tokens(file_name):
    """Encode a song-artist label as a pair of base-424 digits.

    Looks `file_name` up in the module-level `classifier_labels` list and
    returns `[index // 424, index % 424]`.

    Raises ValueError (from `list.index`) when the label is unknown.
    """
    label_index = classifier_labels.index(file_name)

    high_digit, low_digit = divmod(label_index, 424)

    return [high_digit, low_digit]

def song_artist_tokens_to_song_artist(file_name_tokens):
    """Decode a pair of base-424 digits back into a song-artist label.

    Only the first two items of `file_name_tokens` are read; the label at
    index `tokens[0] * 424 + tokens[1]` of the module-level
    `classifier_labels` list is returned.

    Raises IndexError when the decoded index is outside the label list.
    """
    high_digit, low_digit = file_name_tokens[0], file_name_tokens[1]

    return classifier_labels[high_digit * 424 + low_digit]

#===============================================================================

# Announce the step, then read the two label tables that ship with the repo.
print('=' * 70, 'Loading Ultimate MIDI Classifier labels...', '=' * 70, sep='\n')

# Song-artist labels; the helper functions above index into this list
classifier_labels = TMIDIX.Tegridy_Any_Pickle_File_Reader(
    '/content/Ultimate-MIDI-Classifier/Data/Ultimate_MIDI_Classifier_Song_Artist_Labels'
)
print('=' * 70)

# Genre table: item 0 of each entry is matched against song-artist labels and
# item 1 is reported as the genre by the classification cells below
genre_labels = TMIDIX.Tegridy_Any_Pickle_File_Reader(
    '/content/Ultimate-MIDI-Classifier/Data/Ultimate_MIDI_Classifier_Music_Genre_Labels'
)
genre_labels_fnames = [entry[0] for entry in genre_labels]

print('=' * 70, 'Done!', '=' * 70, sep='\n')

"""# (LOAD MODEL)"""

#@title Load Ultimate MIDI Classifier Pre-Trained Model

model_precision = "bfloat16" # @param ["bfloat16", "float16", "float32"]
plot_tokens_embeddings = True # @param {type:"boolean"}

print('=' * 70)
print('Setting-up Ultimate MIDI Classifier model...')
print('Please wait...')
print('=' * 70)

model_path = '/content/Ultimate-MIDI-Classifier/Model/Ultimate_MIDI_Classifier_Trained_Model_29886_steps_0.556_loss_0.8339_acc.pth'

# Download the checkpoint from the Hugging Face Hub only when it is not on disk yet
if not os.path.isfile(model_path):
  hf_hub_download(repo_id='asigalov61/Ultimate-MIDI-Classifier',
                  filename='Ultimate_MIDI_Classifier_Trained_Model_29886_steps_0.556_loss_0.8339_acc.pth',
                  local_dir='/content/Ultimate-MIDI-Classifier/Model',
                  )

else:
  print('Model already exists...')

print('=' * 70)
print('Instantiating model...')

device_type = 'cuda'

# Resolve the autocast precision:
#   float32  -> float32
#   bfloat16 -> bfloat16 when the GPU supports it, otherwise float16
#   anything else (including float16) -> float16
if model_precision == 'float32':
  dtype = 'float32'

elif model_precision == 'bfloat16' and torch.cuda.is_bf16_supported():
  dtype = 'bfloat16'

else:
  dtype = 'float16'

# The precision names are also the names of the torch dtype attributes
ptdtype = getattr(torch, dtype)
ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype)

SEQ_LEN = 1026 # Model sequence length (passed as max_seq_len)
PAD_IDX = 940 # Model pad index (vocabulary size is PAD_IDX+1)

# instantiate the model: transformer -> autoregressive wrapper -> DataParallel

model = torch.nn.DataParallel(
    AutoregressiveWrapper(
        TransformerWrapper(
            num_tokens = PAD_IDX+1,
            max_seq_len = SEQ_LEN,
            attn_layers = Decoder(dim = 1024, depth = 24, heads = 32, attn_flash = True)
        ),
        ignore_index=PAD_IDX,
        pad_value=PAD_IDX
    )
)

model.cuda()
print('=' * 70)

print('Loading model checkpoint...')

model.load_state_dict(torch.load(model_path))
print('=' * 70)

model.eval()

print('Done!')
print('=' * 70)

print('Model will use', dtype, 'precision...')
print('=' * 70)

# Model stats
print('Model summary...')
summary(model)

if plot_tokens_embeddings:

  # Pairwise cosine distances between all token embedding vectors
  tok_emb = model.module.net.token_emb.emb.weight.detach().cpu().tolist()

  cos_sim = metrics.pairwise_distances(tok_emb, metric='cosine')

  plt.figure(figsize=(7, 7))
  plt.imshow(cos_sim, cmap="inferno", interpolation="nearest")

  # Scale the colorbar with the image aspect ratio
  im_ratio = cos_sim.shape[0] / cos_sim.shape[1]
  plt.colorbar(fraction=0.046 * im_ratio, pad=0.04)

  plt.xlabel("Position")
  plt.ylabel("Position")
  plt.tight_layout()
  plt.plot()
  plt.savefig("/content/Ultimate-MIDI-Classifier-Tokens-Embeddings-Plot.png", bbox_inches="tight")

"""# (LOAD SOURCE MIDI)"""

# @title Load a MIDI file to classify
full_path_to_MIDI_file = "/content/Ultimate-MIDI-Classifier/Seeds/Come To My Window.mid" # @param {type:"string"}

print('=' * 70)
print('Loading MIDI file...')

# splitext drops only the extension, so dots inside the file name survive
# (e.g. "Mr. Brightside.mid" -> "Mr. Brightside" instead of "Mr")
midi_name = os.path.splitext(os.path.basename(full_path_to_MIDI_file))[0]

raw_score = TMIDIX.midi2single_track_ms_score(full_path_to_MIDI_file)

#===============================================================================
# Enhanced score notes

escore_notes = TMIDIX.advanced_score_processor(raw_score, return_enhanced_score_notes=True)[0]

#===============================================================================
# Outputs of this cell. They are defined up front so the summary below and the
# following cells still work when the MIDI yields no usable notes.

melody_chords = [] # flat token sequence, three tokens per note
notes_counter = 0  # number of notes written to melody_chords

if len(escore_notes) > 0:

    #=======================================================
    # PRE-PROCESSING

    #===============================================================================
    # Augmented enhanced score notes

    escore_notes = TMIDIX.augment_enhanced_score_notes(escore_notes, timings_divider=32)

    # Keep only notes whose patch (e[6]) is below 80 or equal to 128
    escore_notes = [e for e in escore_notes if e[6] < 80 or e[6] == 128]

# The patch filter above may have removed every note, so check again before
# reading escore_notes[0]
if len(escore_notes) > 0:

    #=======================================================
    # MAIN PROCESSING CYCLE
    #=======================================================

    # Previous note, used to compute the delta start time
    pe = escore_notes[0]

    # Pitch tokens already written at the current start time (duplicate filter)
    pitches = []

    for e in escore_notes:

        #=======================================================
        # Timings...

        delta_time = max(0, min(127, e[1]-pe[1]))

        # A new start time begins a new chord: reset the duplicate filter
        if delta_time != 0:
            pitches = []

        # Durations and channels

        dur = max(1, min(127, e[2]))

        # Patches
        pat = max(0, min(128, e[6]))

        # Pitches: patch 128 notes are shifted by 128 so they occupy their own
        # pitch range (the MIDI conversion cell maps that range to channel 9)

        if pat == 128:
            ptc = max(1, min(127, e[4]))+128
        else:
            ptc = max(1, min(127, e[4]))

        #=======================================================
        # FINAL NOTE SEQ

        # Writing final note synchronously
        # Token layout per note: delta time (0-127), duration+128, pitch+256

        if ptc not in pitches:
            melody_chords.extend([delta_time, dur+128, ptc+256])
            pitches.append(ptc)
            notes_counter += 1

        pe = e

#==============================================================

print('Done!')
print('=' * 70)
print('Composition has', notes_counter, 'notes')
print('=' * 70)

"""# (CLASSIFY)"""

# @title Classify MIDI

# @markdown You can stop classification at any time to render partial results
classification_sampling_resolution = 2 # @param {type:"slider", min:1, max:5, step:1}

print('=' * 70)
print('Ultimate MIDI Classifier')
print('=' * 70)

print('Input MIDI file name:', midi_name)
print('=' * 70)
print('Sampling score...')

# Each sample is 1020 tokens = 340 notes (three tokens per note)
chunk_size = 1020

score = melody_chords

# The step between samples is computed in whole notes and converted back to
# tokens, so every sample starts on a note boundary. Stepping by
# chunk_size // resolution tokens is not a multiple of 3 for resolution 3
# (340 tokens) and would start every later sample in the middle of a note.
notes_step = chunk_size // 3 // classification_sampling_resolution
chunk_step = notes_step * 3

samples_overlap = 340 - notes_step

input_data = []

# "+ 1" keeps the last full chunk when it ends exactly at the end of the score
# (including a score that is exactly chunk_size tokens long). Every slice taken
# inside this range is a full chunk.
for i in range(0, len(score)-chunk_size+1, chunk_step):

    # 937 and 938 frame the note tokens of each sample
    td = [937]

    td.extend(score[i:i+chunk_size])

    td.extend([938])

    input_data.append(td)

print('Done!')
print('=' * 70)
print('Composition was split into' , len(input_data), 'samples', 'of 340 notes each with', samples_overlap, 'notes overlap')
print('=' * 70)
print('Number of notes in all composition samples:', len(input_data) * 340)
print('=' * 70)

#==============================================================

print('Classifying...')
print('=' * 70)

torch.cuda.empty_cache()

model.eval()

artist_results = []
song_results = []

# One (token, token) tuple per classified sample
results = []

for sample in tqdm.tqdm(input_data):

  try:

    x = torch.tensor(sample[:1022], dtype=torch.long, device='cuda')

    # Top-1 sampling of the two tokens that follow the 938 marker
    with ctx:
      out = model.module.generate(x,
                                  2,
                                  filter_logits_fn=top_k,
                                  filter_kwargs={'k': 1},
                                  temperature=0.9,
                                  return_prime=False,
                                  verbose=False)

    result = tuple(out[0].tolist())

    results.append(result)

  except KeyboardInterrupt:
    print('Stopping...')
    break

  except Exception as ex:
    print('Error!')
    print(ex)
    break

print('=' * 70)
print('Done!')
print('=' * 70)

if results:

  # Most frequent token pair across all samples
  final_result = statistics.mode(results)

  # Output tokens are offset by 512 from the base-424 label digits
  result_toks = [final_result[0]-512, final_result[1]-512]
  song_artist = song_artist_tokens_to_song_artist(result_toks)
  gidx = genre_labels_fnames.index(song_artist)
  genre = genre_labels[gidx][1]

  print('Most common classification genre label:', genre)
  print('Most common classification song-artist label:', song_artist)
  print('Most common song-artist classification label ratio:' , results.count(final_result) / len(results))
  print('=' * 70)

  print('All classification labels summary:')
  print('=' * 70)

  all_artists_labels = []

  for i, res in enumerate(results):
    result_toks = [res[0]-512, res[1]-512]
    song_artist = song_artist_tokens_to_song_artist(result_toks)
    gidx = genre_labels_fnames.index(song_artist)
    genre = genre_labels[gidx][1]
    print('Notes', i*notes_step, '-', (i*notes_step)+340, '===', genre, '---', song_artist)

    # The artist is the part of the label after the ' --- ' separator
    artist_label = str_strip_artist(song_artist.split(' --- ')[1])

    all_artists_labels.append(artist_label)

  print('=' * 70)

  mode_artist_label = statistics.mode(all_artists_labels)
  mode_artist_label_count = all_artists_labels.count(mode_artist_label)

  print('Aggregated artist classification label:', mode_artist_label)
  print('Aggregated artist classification label ratio:', mode_artist_label_count / len(all_artists_labels))

else:

  # statistics.mode() raises on empty data, so report the situation instead
  print('No classification results!')
  print('The composition needs at least 340 notes and at least one sample must be classified')

print('=' * 70)
print('Done!')
print('=' * 70)

"""# (GENERATE)"""

# @title Generate classified music composition

#@markdown NOTE: You can stop the generation at any time to render partial results

start_from = "scratch" # @param ["scratch", "loaded_MIDI"]
number_of_blocks_to_generate = 2 # @param {type:"slider", min:1, max:25, step:1}
model_sampling_top_k_value = 25 # @param {type:"slider", min:1, max:50, step:1}
render_MIDI_to_audio = True # @param {type:"boolean"}

print('=' * 70)
print('Ultimate MIDI Classifier Music Generator')
print('=' * 70)

# One "genre --- song-artist" string per generated block
all_composition_classification_labels = []

print('Generating prime block...')

# Priming from the loaded MIDI needs at least one sample from the classification
# cell; without one, input_data[0] would raise IndexError. Fall back to
# generating from scratch and say so.
if start_from != "scratch" and len(input_data) == 0:
  print('No samples available from the loaded MIDI, generating from scratch instead...')
  prime_from_scratch = True

else:
  prime_from_scratch = start_from == "scratch"

if prime_from_scratch:
  # Only the 937 marker that opens every sample
  x = torch.tensor([[937]], dtype=torch.long, device='cuda')

else:
  # 937 marker plus the first 510 note tokens (170 notes) of the first sample
  x = torch.tensor([input_data[0][:511]], dtype=torch.long, device='cuda')

# Generate until the block is 1021 tokens long: 937 marker + 340 notes
with ctx:
  out = model.module.generate(x,
                              1021-x.shape[1],
                              filter_logits_fn=top_k,
                              filter_kwargs={'k': model_sampling_top_k_value},
                              temperature=0.9,
                              return_prime=True,
                              verbose=False)

prime_output = out.tolist()[0]

print('=' * 70)
print('Classifiying prime block...')


# Append the 938 marker and take the top-1 prediction for the next two tokens
x = torch.tensor([prime_output+[938]], dtype=torch.long, device='cuda')

with ctx:
  out = model.module.generate(x,
                              2,
                              filter_logits_fn=top_k,
                              filter_kwargs={'k': 1},
                              temperature=0.9,
                              return_prime=False,
                              verbose=False)

prime_output_cls = out.tolist()[0]

# Output tokens are offset by 512 from the base-424 label digits
result_toks = [prime_output_cls[0]-512, prime_output_cls[1]-512]
song_artist = song_artist_tokens_to_song_artist(result_toks)
gidx = genre_labels_fnames.index(song_artist)
genre = genre_labels[gidx][1]

all_composition_classification_labels.append(genre + ' --- ' + song_artist)

print('=' * 70)
print('Prime block classification genre label:', genre)
print('Prime block classification song-artist label:', song_artist)
print('=' * 70)

print('Continuing generation...')
print('=' * 70)

# Full generated composition as a flat token list, seeded with the prime block
output = []
output.extend(prime_output)

# Generate and classify one block per iteration; any error or a manual
# interrupt stops the loop and keeps what has been generated so far
for i in range(number_of_blocks_to_generate):

  try:

    print('Generating block #', i+1)

    # Context: 937 marker + 510 tokens taken from the end of the composition,
    # excluding its last three tokens (511 tokens in total)
    # NOTE(review): the new tokens are appended after those three skipped
    # tokens, which the model did not see as context -- confirm this is intended
    x = torch.tensor([[937] + output[-513:-3]], dtype=torch.long, device='cuda')

    # 1021 - 511 = 510 new tokens per block
    with ctx:
      out = model.module.generate(x,
                                  1021-x.shape[1],
                                  filter_logits_fn=top_k,
                                  filter_kwargs={'k': model_sampling_top_k_value},
                                  temperature=0.9,
                                  return_prime=False,
                                  verbose=False)

    outy = out.tolist()[0]
    output.extend(outy)

    print('=' * 70)
    print('Classifiying block #', i+1)

    # Classify the last 1021 tokens of the composition followed by the 938 marker
    # NOTE(review): unlike the prime block, this window does not start with
    # the 937 marker once more than one block exists -- confirm against training format
    x = torch.tensor([output[-1021:]+[938]], dtype=torch.long, device='cuda')

    # Top-1 prediction of the two tokens that follow the 938 marker
    with ctx:
      out = model.module.generate(x,
                                  2,
                                  filter_logits_fn=top_k,
                                  filter_kwargs={'k': 1},
                                  temperature=0.9,
                                  return_prime=False,
                                  verbose=False)

    output_cls = out.tolist()[0]

    # Output tokens are offset by 512 from the base-424 label digits
    result_toks = [output_cls[0]-512, output_cls[1]-512]
    song_artist = song_artist_tokens_to_song_artist(result_toks)
    gidx = genre_labels_fnames.index(song_artist)
    genre = genre_labels[gidx][1]

    all_composition_classification_labels.append(genre + ' --- ' + song_artist)

    print('=' * 70)
    print('Block #', i+1, 'classification genre label:', genre)
    print('Block #', i+1, 'classification song-artist label:', song_artist)
    print('=' * 70)

  except KeyboardInterrupt:
    print('Stopping...')
    print('=' * 70)
    break

  except Exception as ex:
    print('Error!')
    print(ex)
    break

print('Converting generated blocks to MIDI...')
print('=' * 70)

print('Sample INTs', output[:15])

if output:

    song = output
    song_f = []

    # Running decoder state
    time, dur, vel, pitch, channel = 0, 0, 90, 0, 0

    for ss in song:

        # The three token ranges are disjoint, so at most one branch fires
        if 0 <= ss < 128:

            # Delta start time token
            time += ss * 32

        elif 128 < ss < 256:

            # Duration token
            dur = (ss-128) * 32

        elif 256 < ss < 512:

            # Pitch token: the upper half of the range goes to channel 9
            chan, pitch = divmod(ss-256, 128)

            channel = 9 if chan == 1 else 0

            if channel == 0:
              vel = max(40, pitch)
            else:
              vel = 120 if pitch % 2 else 110

            # Last field is the patch: 0 on channel 0, 128 on channel 9
            song_f.append(['note', time, dur, channel, pitch, vel, 0 if channel == 0 else 128])

fname = '/content/Ultimate-MIDI-Classifier-Composition'

detailed_stats = TMIDIX.Tegridy_ms_SONG_to_MIDI_Converter(song_f,
                                                        output_signature = 'Ultimate MIDI Classifier',
                                                        output_file_name = fname,
                                                        track_name='Project Los Angeles',
                                                        )

print('=' * 70)
print('Displaying resulting composition...')
print('=' * 70)

if render_MIDI_to_audio:
  midi_audio = midi_to_colab_audio(fname + '.mid')
  display(Audio(midi_audio, rate=16000, normalize=False))

TMIDIX.plot_ms_SONG(song_f, plot_title=fname)

"""# Congrats! You did it! :)"""