diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c1e9646 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +/target +/*.iml +/.* +!/.gitignore diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..1914ab6 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,6 @@ +# Change Log + +## 0.1.0 - 2018-03-16 +### Added +- Conversion of MSD files into H2 +- README with usage instructions diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..ad88984 --- /dev/null +++ b/LICENSE @@ -0,0 +1,16 @@ +Copyright (c) 2018 Nicolas Duchenne, Belove Ltd, London, UK + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software +and associated documentation files (the "Software"), to deal in the Software without restriction, +including without limitation the rights to use, copy, modify, merge, publish, distribute, +sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or +substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
\ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..dfb6b71 --- /dev/null +++ b/README.md @@ -0,0 +1,119 @@ +# MSD Lyrics SQL database + +A command line tool to load the lyrics subset of the [Million Song Dataset](https://labrosa.ee.columbia.edu/millionsong/) +into an [H2 SQL database](http://www.h2database.com/html/main.html). + +A SQL database makes it easy to inspect, clean, aggregate, filter and slice the dataset, via a GUI or programmatically. + + +## Installation + + +Install [Java](http://java.com/en/download/), download the jar file from the release page of this repository and follow the instructions below. + +Alternatively, you can clone this repository and run the code with [Leiningen](http://leiningen.org/), the build automation tool +for [Clojure](http://clojure.org). Start by editing the `msd.edn` file in the project root (see further below) and then execute +the code with `lein run`. + +You don't need to install H2 on your machine to run the program. The database engine is embedded in the program. + +However, you'll need some H2 compatible tool to view the data. An option is to [install H2](http://www.h2database.com/html/download.html) +and use the included [console application](http://www.h2database.com/html/quickstart.html). +Another is to use an H2 compatible front-end such as [DataGrip](https://www.jetbrains.com/datagrip/). 
+ + + + +## Usage + +Gather the following files from the [Million Song Dataset](https://labrosa.ee.columbia.edu/millionsong/) and place them in +the same directory (while you're browsing the websites, check the licensing/citing terms for the various subsets): + +- from https://labrosa.ee.columbia.edu/millionsong/musixmatch: + - mxm_779k_matches.txt + - mxm_dataset_test.txt + - mxm_dataset_train.txt + - mxm_reverse_mapping.txt +- from https://labrosa.ee.columbia.edu/millionsong/pages/getting-dataset: + - tracks_per_year.txt +- from http://www.ifs.tuwien.ac.at/mir/msd/download.html#groundtruth: + - msd-MASD-styleAssignment.cls + +Place the jar file into the same directory and run: + + $ java -jar msd-to-h2-0.1.0-standalone.jar + +Give it a few minutes to create the output files: + + Creating the csv files... + Creating the database. Just a bit of patience. + Creating primary tables... + Creating indexes... + Creating derived tables... + All done! + The 3 artists with the largest vocabulary in the Million Song Dataset are Aesop Rock with 2555 words, Eminem with 2526 words, Cypress Hill with 2476 words + + + + +## Outputs + +The program runs in two stages. + +Firstly, the program converts the original MSD files into CSV files. Words and tracks are given new unique integer ids, +and files that relate to each other are consolidated (e.g. tracks + track years + track genres). + +Secondly, the program uploads the resulting CSV files into a new H2 database. Tables are created for tracks, words and the track/word matrix. +The tool also creates a table of artists (based on the MusicXMatch artist names in the dataset) with aggregate track count, vocabulary count, +and year range for each artist. This list of artists is preliminary and is meant to help prioritize data cleaning, rather than being used as is. 
+ + + + +## Options + +Rather than having all input and output files in the same directory, it is possible to specify different locations for input files, csv output files +and the database. To do this, create an [edn file](https://learnxinyminutes.com/docs/edn/) called `msd.edn` with the following keys: + +- `:in` directory of the MSD input files +- `:csv` directory of the csv output files +- `:db` output database file according to +[H2's URL format for an embedded database](http://www.h2database.com/html/features.html#database_url), without the `jdbc:h2:` prefix. +The code was tested for a location relative to the `msd.edn` file (`./`) +or to the user's home directory (`~/`). + +For example: + + {:in "/data/msd/source/" + :csv "/data/msd/csv/" + :db "~/h2data/msd"} + +You can then run `java -jar msd-to-h2-0.1.0-standalone.jar` at the location of the `msd.edn` file. + +If there is no `msd.edn` file (like earlier) then the program defaults to the following parameters: + + {:in "./" + :csv "./" + :db "./msd"} + + + + +## Other SQL Engines + +Amending the code to accommodate other SQL engines should be straightforward +. +You'll have to: + +- [ ] change the database driver dependency in `project.clj` +- [ ] adapt the database spec in `src/intoh2/core.clj` (look for `db-spec` and `create-database`) +- [ ] adapt the scripts in `resources/sql` to the SQL dialect of the new database + + + + +## License + +Copyright © 2018 Nicolas Duchenne, Belove Ltd, London, UK + +Released under the [MIT License](https://opensource.org/licenses/MIT). 
diff --git a/msd.edn b/msd.edn new file mode 100644 index 0000000..6403ad7 --- /dev/null +++ b/msd.edn @@ -0,0 +1,3 @@ +{:in "/data/millionsong/dataset/extract" + :csv "/data/millionsong/dataset/csv" + :db "~/data/millionsong/dataset/h2/msd;AUTO_SERVER=TRUE;AUTO_SERVER_PORT=9091"} \ No newline at end of file diff --git a/project.clj b/project.clj new file mode 100644 index 0000000..f360119 --- /dev/null +++ b/project.clj @@ -0,0 +1,18 @@ +(defproject msd-to-h2 "0.1.0" + :description "Converts lyrics files in the Million Song Dataset into an H2 database." + :url "https://github.com/belovehq/msd-to-h2" + :license {:name "MIT License" + :url "https://opensource.org/licenses/MIT"} + :dependencies [[org.clojure/clojure "1.9.0"] + [com.layerware/hugsql "0.4.8"] + [com.h2database/h2 "1.4.195"]] + + :source-paths ["src"] + :target-path "target/%s" + :resource-paths ["resources"] + + :main ^:skip-aot intoh2.core + + :profiles {:uberjar {:aot :all}}) + + diff --git a/resources/sql/derivedtables.sql b/resources/sql/derivedtables.sql new file mode 100644 index 0000000..d10e5f1 --- /dev/null +++ b/resources/sql/derivedtables.sql @@ -0,0 +1,39 @@ +-- Create and populate artists table + +CREATE TABLE msdartists ( + artistid INT PRIMARY KEY AUTO_INCREMENT, + mxmartistname VARCHAR(55), + trackcount INT, + vocabulary INT, + fromyear INT, + toyear INT +); + + +INSERT INTO msdartists (mxmartistname, trackcount, vocabulary, fromyear, toyear) + SELECT + t.mxmartistname AS mxmartistname, + COUNT(DISTINCT t.trackid) AS trackcount, + COUNT(DISTINCT m.wordid) AS vocabulary, + MIN(t.trackyear) AS fromyear, + MAX(t.trackyear) AS toyear + FROM msdtracks t + INNER JOIN matrix M ON M.trackid = t.trackid + GROUP BY mxmartistname + ORDER BY mxmartistname; + +CREATE INDEX IX_msdartists_artistname + ON msdartists (mxmartistname); + +-- -- Create and populate genres table + +CREATE TABLE msdgenres ( + genreid INT AUTO_INCREMENT PRIMARY KEY, + masdgenre VARCHAR(20) +); + +INSERT INTO msdgenres 
(masdgenre) + SELECT DISTINCT masdgenre + FROM msdtracks + WHERE masdgenre IS NOT NULL + ORDER BY masdgenre; diff --git a/resources/sql/hello.sql b/resources/sql/hello.sql new file mode 100644 index 0000000..067a0ce --- /dev/null +++ b/resources/sql/hello.sql @@ -0,0 +1,7 @@ +-- :name hello :query :one +SELECT CONCAT('The 3 artists with the largest vocabulary in the Million Song Dataset are ', + GROUP_CONCAT(s SEPARATOR ', ')) out +FROM + (SELECT TOP 3 CONCAT(mxmartistname, ' with ', vocabulary, ' words') s + FROM msdartists + ORDER BY vocabulary DESC); \ No newline at end of file diff --git a/resources/sql/indexes.sql b/resources/sql/indexes.sql new file mode 100644 index 0000000..d434f2d --- /dev/null +++ b/resources/sql/indexes.sql @@ -0,0 +1,12 @@ +-- indexes on msdtracks +CREATE PRIMARY KEY ON msdtracks (trackid); +CREATE UNIQUE INDEX ix_msdtracks_entrackid ON msdtracks (entrackid); +CREATE INDEX ix_msdtracks_masdgenre ON msdtracks (masdgenre); +CREATE INDEX ix_msdtracks_mxmartistname ON msdtracks (mxmartistname); + +-- index on msdwords +CREATE PRIMARY KEY ON msdwords (wordid); + +-- indexes on matrix +CREATE PRIMARY KEY ON matrix (trackid, wordid); +CREATE INDEX ix_matrix_wordid_trackid ON matrix (wordid, trackid); diff --git a/resources/sql/primarytables.sql b/resources/sql/primarytables.sql new file mode 100644 index 0000000..72cf966 --- /dev/null +++ b/resources/sql/primarytables.sql @@ -0,0 +1,37 @@ +-- Command definitions for HugSQL. 
+ +-- :name create-tracks-table :execute :raw +CREATE TABLE msdtracks ( + trackid INT NOT NULL, + entrackid VARCHAR(18), + mxmtrackid INT, + istest INT, + entrackttitle VARCHAR(250), + mxmtracktitle VARCHAR(180), + enartistname VARCHAR(100), + mxmartistname VARCHAR(55), + trackyear INT, + masdgenre VARCHAR(20) +) AS + SELECT * + FROM CSVREAD(:sql:file); + +-- :name create-words-table :execute :raw +CREATE TABLE msdwords ( + wordid INT NOT NULL, + stem VARCHAR(15), + word VARCHAR(15) +) AS + SELECT * + FROM CSVREAD(:sql:file); + +-- :name create-matrix-table :execute :raw +CREATE TABLE matrix ( + trackid INT NOT NULL, + wordid INT NOT NULL, + count INT NOT NULL +) AS + SELECT * + FROM CSVREAD(:sql:file); + + diff --git a/resources/sql/safety.sql b/resources/sql/safety.sql new file mode 100644 index 0000000..265c994 --- /dev/null +++ b/resources/sql/safety.sql @@ -0,0 +1,3 @@ +-- re-enable database logs +SET LOG 1; +SET UNDO_LOG 1; \ No newline at end of file diff --git a/resources/sql/speed.sql b/resources/sql/speed.sql new file mode 100644 index 0000000..efb8632 --- /dev/null +++ b/resources/sql/speed.sql @@ -0,0 +1,3 @@ +-- momentarily disable database logs +SET LOG 0; +SET UNDO_LOG 0; diff --git a/src/intoh2/core.clj b/src/intoh2/core.clj new file mode 100644 index 0000000..313032a --- /dev/null +++ b/src/intoh2/core.clj @@ -0,0 +1,317 @@ +(ns intoh2.core + (:require [clojure.java.io :refer [reader writer file resource as-file]] + [clojure.string :as s] + [clojure.edn] + [intoh2.sql :as sql] + [clojure.java.jdbc :refer [with-db-connection]]) + (:gen-class)) + + + +; specs + +(def msd-spec + "the map of source MSD files" + {:years {:name "tracks_per_year.txt"} + :stems {:name "mxm_reverse_mapping.txt"} + :matches {:name "mxm_779k_matches.txt"} + :genres {:name "msd-MASD-styleAssignment.cls" :separator "\t"} + :train {:name "mxm_dataset_train.txt"} + :test {:name "mxm_dataset_test.txt"}}) + + +(def csv-spec + "the map of CSV files that will be produced from the MSD files before 
loading into H2" + {:years {:name "msdyears.csv" + :header ["year" "entrackid" "artistname" "tracktitle"]} + :stems {:name "mxmstems.csv" + :header ["stem" "word"]} + :matches {:name "mxmmatches.csv" + :header ["entrackid" "enartistname" "entracktitle" + "mxmtrackid" "mxmartistname" "mxmtracktitle"]} + :genres {:name "masdgenres.csv" + :header ["entrackid", "masdgenre"]} + :words {:name "mxmwords.csv" + :header ["wordid" "stem"]} + :tracks {:name "mxmtracks.csv" + :header ["trackid" "entrackid" "mxmtrackid" "test"]} + :final-pairs {:name "h2matrix.csv" + :header ["trackid" "wordid" "count"]} + :final-words {:name "h2words.csv" + :header ["wordid" "stem" "word"]} + :final-tracks {:name "h2tracks.csv" + :header ["trackid" "entrackid" "mxmtrackid" "test" "entracktitle" + "mxmtracktitle" "enartistname" "mxmartistname" "trackyear" "masdgenre"]}}) + + +(def db-spec + "Base h2 database spec" + {:classname "org.h2.Driver" + :subprotocol "h2" + :subname "" + :user "" + :password "" + :DB_CLOSE_DELAY "-1"}) + + +(defn pathify + "Append a slash at the end of a string if none is present" + [s] (let [t (s/trim s)] + (if (or (= t "") (#{\/ \\} (last t))) t (str t "/")))) + + +(def path-spec + "the map of input and output paths: + :in the path of the input msd files + :csv the path where to create csv files + :db the database file to create" + (try + (-> (clojure.edn/read-string (slurp "msd.edn")) + (update-in [:in] pathify) + (update-in [:csv] pathify)) + (catch Exception e + {:in "./" + :csv "./" + :db "./msd"}))) + + + +; pure functions + + +(defn vec-to-csv + "convert a vector to a csv line, incl. 
line break" + [v] (apply str (concat (interpose "," v) "\n"))) + + +(defn cs-to-vec + "convert a comma separated string to a vector of strings" + [s] (s/split s #",")) + + +(defn msd-full-path + "return the full path specified for the MSD file that has key k" + [k] (str (:in path-spec) (get-in msd-spec [k :name]))) + + +(defn csv-full-path + "return the full path specified for the csv file that has key k" + [k] (str (:csv path-spec) (get-in csv-spec [k :name]))) + + +(defn csv-header + "return the comma-separated header specified for the csv file that has key k" + [k] (vec-to-csv (get-in csv-spec [k :header]))) + + +(defn msd-separator + "return the separator specified for the MSD file that has key k; defaults to the MSD field separator <SEP> (an empty default would make s/replace insert a comma between every character)" + [k] (let [sep (get-in msd-spec [k :separator])] + (if (nil? sep) "<SEP>" sep))) + + +(defn tabular-to-csv + "convert a tabular line (with separator sep) to a CSV line. Insert id at start of line if supplied. Also replace double quotes with simple quotes." + [sep line id] + (if (= (first line) \#) "" + (str (if id (str id ",")) + (-> line + (s/replace sep ",") + (s/replace \" \')) + "\n"))) + + +(defn seq-of-ids + "return an infinite sequence of int starting at 1 if b is true, and of nil otherwise." + [b] (if b (rest (range)) + (repeat nil))) + + + +; read and write + + +(defn convert-msd-file-to-csv + "convert a tabular MSD file into a CSV file. + in: path of the msd file + out: path of the csv file to produce + sep: separator in the MSD file + header: header of the csv file, comma separated and including line break + insertids: if true then insert an incremental id at the beginning of each row" + [in out sep header insertids] + (with-open [rdr (reader in) wtr (writer out)] + (if header (.write wtr header)) + (doseq [line (map (partial tabular-to-csv sep) (line-seq rdr) (seq-of-ids insertids))] + (.write wtr line)))) + + +(defn convert-msd-files-to-csv + "convert a set of tabular MSD files into CSV files. 
+ keys: sequence of file keys" + ([keys] (doseq [k keys] + (let [in (msd-full-path k) + out (csv-full-path k) + sep (msd-separator k) + header (csv-header k)] + (convert-msd-file-to-csv in out sep header nil))))) + + +(defn write-string-to-csv + "Write a comma separated string into a CSV file with one entry per line. + out: path of the destination file + s: input string + header: header of the csv file, comma separated and including line break + insertids: if true then the first column of the csv file will be an incremental integer id" + ([out s header insertids] + (let [tocsv (fn [w id] (str (if id (str id ",")) w "\n"))] + (with-open [wtr (writer out)] + (if header (.write wtr header)) + (doseq [line (map tocsv (cs-to-vec s) (seq-of-ids insertids))] + (.write wtr line)))))) + + + +(defn csv-file-to-map + "convert a csv file into a map {key1 {:col1 ... :col2 ...} key2 {:col1 ... :col2 ...} ... }. + file: full path of the csv file + key-col: keyword of the header column whose value becomes the key of each map entry + include-cols: which columns to include in the output (if nil, include all columns)" + ([file key-col] (csv-file-to-map file key-col nil)) + ([file key-col include-cols] + (with-open [rdr (reader file)] + (let [lines (line-seq rdr) + csvheader (map keyword (cs-to-vec (first lines))) + add-kv-pair (fn [out line] + (let [m (zipmap csvheader (cs-to-vec line)) + k (key-col m) + v (if include-cols (select-keys m include-cols) m)] + (assoc out k v)))] + (reduce add-kv-pair {} (rest lines)))))) + + +(defn ^:private write-lyricsdataset-to-csv + "Convert a single musicXmatch lyrics matrix and append it to CSV files. 
+ in: the musicXmatch lyrics matrix file name, either the training file or the test file + tw, pw: writers to the tracks csv file and the pairs csv file + seed: the point from which to start the tracks numbering" + [in tw pw seed] + (let [is-test-set (> seed 0) + + test-flag (if is-test-set 1 0) + + write-words (fn [s] (write-string-to-csv (csv-full-path :words) s (csv-header :words) true)) + + write-track-data (fn [s trackid] (let [[msdtrackid mxmtrackid & word-counts] (cs-to-vec s)] + (.write tw (vec-to-csv [trackid msdtrackid mxmtrackid test-flag])) + (doseq [word-count word-counts] + (let [[wordid n] (s/split word-count #":")] + (.write pw (vec-to-csv [trackid wordid n])))))) + + do-track (fn [oldid s] (case (first s) + \# oldid + \% (do (if (not is-test-set) (write-words (subs s 1))) oldid) + (let [trackid (inc oldid)] (write-track-data s trackid) trackid)))] + + (with-open [rdr (reader in)] + (reduce do-track seed (line-seq rdr))))) + + +(defn write-lyrics-to-csv + "Convert the two musicXmatch lyrics matrices into aggregated CSV files" + [] (let [out-tracks (csv-full-path :tracks) + out-words (csv-full-path :words) + out-pairs (csv-full-path :final-pairs)] + (with-open [tw (writer out-tracks) pw (writer out-pairs)] + (.write tw (csv-header :tracks)) + (.write pw (csv-header :final-pairs)) + (let [seed (write-lyricsdataset-to-csv (msd-full-path :train) tw pw 0)] + (write-lyricsdataset-to-csv (msd-full-path :test) tw pw seed))))) + + +(defn join-track-files + "Consolidate track data into a single csv file. 
Sources are: + a) the tracks found in the mxm training and test datasets + b) the mxm/msd track matching list + c) the list of tracks with years + d) the list of tracks with genres" + [] + (let [outfile (csv-full-path :final-tracks) + matches (csv-file-to-map (csv-full-path :matches) :entrackid) + years (csv-file-to-map (csv-full-path :years) :entrackid [:year]) + styles (csv-file-to-map (csv-full-path :genres) :entrackid [:masdgenre])] + (with-open [rdr (reader (csv-full-path :tracks)) wtr (writer outfile)] + (.write wtr (csv-header :final-tracks)) + (doseq [line (rest (line-seq rdr))] + (let [entrackid (second (cs-to-vec line)) + mx (get matches entrackid) + enartist (get mx :enartistname "") + mxmartist (get mx :mxmartistname "") + s (vec-to-csv [line + (get mx :entracktitle "") + (get mx :mxmtracktitle "") + enartist + mxmartist + (get-in years [entrackid :year] "") + (get-in styles [entrackid :masdgenre] "")])] + (.write wtr s)))))) + + +(defn join-word-files + "Consolidate word data into a single csv file. 
Sources are: + a) the list of stemmed words found in the mxm training dataset + b) the stem/word matching list" + ([] + (let [outfile (csv-full-path :final-words) + stems (csv-file-to-map (csv-full-path :stems) :stem [:word])] + (with-open [rdr (reader (csv-full-path :words)) wtr (writer outfile)] + (.write wtr (csv-header :final-words)) + (doseq [line (rest (line-seq rdr))] + (let [stem (second (cs-to-vec line))] + (.write wtr (vec-to-csv [line (get-in stems [stem :word] "")])))))))) + + + +; orchestration + + +(defn create-csv-files + "Create all csv files based on all MSD files" + [] (do + (convert-msd-files-to-csv [:years :stems :matches :genres]) + (write-lyrics-to-csv) + (join-track-files) + (join-word-files))) + + +(defn create-database + "Create the database based on all csv files" + [] (let [db (assoc db-spec :subname (:db path-spec))] + (with-db-connection [con db] + (sql/runscript con "sql/speed.sql") + (println "Creating primary tables...") + (sql/create-tracks-table con {:file (str "'" (csv-full-path :final-tracks) "'")}) + (sql/create-words-table con {:file (str "'" (csv-full-path :final-words) "'")}) + (sql/create-matrix-table con {:file (str "'" (csv-full-path :final-pairs) "'")}) + (println "Creating indexes...") + (sql/runscript con "sql/indexes.sql") + (println "Creating derived tables...") + (sql/runscript con "sql/derivedtables.sql") + (sql/runscript con "sql/safety.sql") + (println "All Done!") + (println (:out (sql/hello con)))))) + + +(defn -main + "Create the csv files and the database" + [& args] + (try + (do + (if (not= #{:csv :db :in} (set (keys path-spec))) + (throw (Exception. "msd.edn is missing or invalid."))) + (println "Creating the csv files...") + (create-csv-files) + (println "Creating the database. 
Just a bit of patience.") + (create-database)) + (catch Exception e + (println (str "An error occured: " (.getMessage e)))))) + diff --git a/src/intoh2/sql.clj b/src/intoh2/sql.clj new file mode 100644 index 0000000..6f1a78f --- /dev/null +++ b/src/intoh2/sql.clj @@ -0,0 +1,14 @@ +(ns intoh2.sql + (:require [hugsql.core :as h] + [clojure.java.io :refer [resource]])) + +; create stubs for sql queries +(h/def-db-fns "sql/primarytables.sql") +(h/def-db-fns "sql/hello.sql") + + +(defn runscript + "run a script from a resource file." + [db r] + (h/db-run db (slurp (resource r)) {} :execute)) +