Skip to content

testing and improvements for artist string parsing #260

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions src/backend/tests/listenbrainz/listenbrainz.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,18 @@ import { UpstreamError } from "../../common/errors/UpstreamError.js";
import { ListenbrainzApiClient, ListenResponse } from "../../common/vendor/ListenbrainzApiClient.js";
import { ExpectedResults } from "../utils/interfaces.js";
import { withRequestInterception } from "../utils/networking.js";
import artistWithProperJoiner from './correctlyMapped/artistProperHasJoinerInName.json';
import artistWithProperJoiner from './correctlyMapped/artistProperHasJoinerInName.json' with { type: "json" };
// correct mappings
import multiArtistInArtistName from './correctlyMapped/multiArtistInArtistName.json';
import multiArtistsInTrackName from './correctlyMapped/multiArtistInTrackName.json';
import multiMappedArtistsWithSingleUserArtist from './correctlyMapped/multiArtistMappingWithSingleRecordedArtist.json';
import noArtistMapping from './correctlyMapped/noArtistMapping.json';
import normalizedValues from './correctlyMapped/normalizedName.json';
import slightlyDifferentNames from './correctlyMapped/trackNameSlightlyDifferent.json';
import multiArtistInArtistName from './correctlyMapped/multiArtistInArtistName.json' with { type: "json" };
import multiArtistsInTrackName from './correctlyMapped/multiArtistInTrackName.json' with { type: "json" };
import multiMappedArtistsWithSingleUserArtist from './correctlyMapped/multiArtistMappingWithSingleRecordedArtist.json' with { type: "json" };
import noArtistMapping from './correctlyMapped/noArtistMapping.json' with { type: "json" };
import normalizedValues from './correctlyMapped/normalizedName.json' with { type: "json" };
import slightlyDifferentNames from './correctlyMapped/trackNameSlightlyDifferent.json' with { type: "json" };

// incorrect mappings
import incorrectMultiArtistsTrackName from './incorrectlyMapped/multiArtistsInTrackName.json';
import veryWrong from './incorrectlyMapped/veryWrong.json';
import incorrectMultiArtistsTrackName from './incorrectlyMapped/multiArtistsInTrackName.json' with { type: "json" };
import veryWrong from './incorrectlyMapped/veryWrong.json' with { type: "json" };

interface LZTestFixture {
data: ListenResponse
Expand Down
114 changes: 114 additions & 0 deletions src/backend/tests/plays/playParsing.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import { loggerTest, loggerDebug, childLogger } from "@foxxmd/logging";
import chai, { assert, expect } from 'chai';
import asPromised from 'chai-as-promised';
import { after, before, describe, it } from 'mocha';

import { asPlays, generateArtistsStr, generatePlay, normalizePlays } from "../utils/PlayTestUtils.js";
import { parseArtistCredits, parseContextAwareStringList, parseCredits } from "../../utils/StringUtils.js";

describe('Parsing Artists from String', function() {

    /**
     * Parses an artist string and flattens the resulting credits into one ordered list:
     * the primary artist first, followed by any secondary artists.
     */
    const parseAllArtists = (str: string, delimiters?: string[]) => {
        const credits = parseArtistCredits(str, delimiters);
        return [credits.primary].concat(credits.secondary ?? []);
    };

    /**
     * Asserts that parsing `str` yields exactly `expected` (order-sensitive, deep equality).
     *
     * The parsed value is the assertion subject and `expected` is the comparison target so that
     * chai reports actual/expected the right way around when the assertion fails.
     */
    const expectArtists = (str: string, expected: string[], delimiters?: string[]) => {
        const parsed = parseAllArtists(str, delimiters);
        expect(parsed, `
'${str}'
Expected => ${expected.join(' || ')}
Found => ${parsed.join(' || ')}`).eql(expected);
    };

    it('Parses Artists from an Artist-like string', function () {
        // randomized input: repeat to cover many joiner/feat/wrap combinations
        for (let run = 0; run < 20; run++) {
            const [str, primaries, secondaries] = generateArtistsStr();
            expectArtists(str, primaries.concat(secondaries));
        }
    });

    it('Parses & as "local" joiner when other delimiters present', function () {

        const data = [{
            str: `Melendi \\ Ryan Lewis \\ The Righteous Brothers (featuring Joan Jett & The Blackhearts \\ Robin Schulz)`,
            expected: ['Melendi', 'Ryan Lewis', 'The Righteous Brothers', 'Joan Jett & The Blackhearts', 'Robin Schulz']
        }, {
            str: `Gigi D'Agostino \\ YOASOBI (vs Sam Hunt, Lisa Loeb & Booba)`,
            expected: [`Gigi D'Agostino`, 'YOASOBI', 'Sam Hunt', 'Lisa Loeb', 'Booba']
        }];

        for (const d of data) {
            expectArtists(d.str, d.expected);
        }
    });

    it('Only parses & as "global" joiner when no other delimiters present', function () {

        const data = [{
            str: `Melendi & Ryan Lewis & The Righteous Brothers (featuring The Blackhearts \\ Robin Schulz)`,
            expected: ['Melendi', 'Ryan Lewis', 'The Righteous Brothers', 'The Blackhearts', 'Robin Schulz']
        }];

        for (const d of data) {
            expectArtists(d.str, d.expected);
        }
    });

    it('Parses secondary free regex', function () {

        const data = [{
            str: `Diddy & Grand Funk Railroad feat. Daya & (G)I-DLE`,
            expected: ['Diddy', 'Grand Funk Railroad', 'Daya', '(G)I-DLE']
        }];

        for (const d of data) {
            expectArtists(d.str, d.expected);
        }
    });

    it('Parses singlar Artist with wrapped vs multiple', function () {
        const [str, primaries, secondaries] = generateArtistsStr({primary: 1, secondary: {num: 2, ft: 'vs', joiner: '/', ftWrap: true}});
        // previously this test asserted only `true === true`; check the parsed artists for real
        expectArtists(str, primaries.concat(secondaries));
        // parseCredits is exercised as a smoke check only: it must accept the same input without throwing
        expect(() => parseCredits(str)).to.not.throw();
    });

    describe('When joiner is known', function () {

        it('Parses many primary artists', function () {
            for (let run = 0; run < 10; run++) {
                const [str, primaries, secondaries] = generateArtistsStr({primary: {max: 3, joiner: '/'}, secondary: 0});
                expectArtists(str, primaries.concat(secondaries), ['/']);
            }
        });

        it('Parses many secondary artists', function () {
            // fails on -- Peso Pluma / Lil Baby / R. Kelly (featuring TOMORROW X TOGETHER / AC/DC / DaVido)
            for (let run = 0; run < 10; run++) {
                const [str, primaries, secondaries] = generateArtistsStr({primary: {max: 3, joiner: '/'}, secondary: {joiner: '/', finalJoiner: false}});
                expectArtists(str, primaries.concat(secondaries), ['/']);
            }
        });

    });

});
116 changes: 115 additions & 1 deletion src/backend/tests/utils/PlayTestUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@ import isBetween from "dayjs/plugin/isBetween.js";
import relativeTime from "dayjs/plugin/relativeTime.js";
import timezone from "dayjs/plugin/timezone.js";
import utc from "dayjs/plugin/utc.js";
import { JsonPlayObject, ObjectPlayData, PlayMeta, PlayObject } from "../../../core/Atomic.js";
import { FEAT, JOINERS, JOINERS_FINAL, JsonPlayObject, ObjectPlayData, PlayMeta, PlayObject } from "../../../core/Atomic.js";
import { sortByNewestPlayDate } from "../../utils.js";
import { NO_DEVICE, NO_USER, PlayerStateDataMaybePlay, PlayPlatformId, ReportedPlayerStatus } from '../../common/infrastructure/Atomic.js';
import { arrayListAnd } from '../../../core/StringUtils.js';
import { findDelimiters } from '../../utils/StringUtils.js';

dayjs.extend(utc)
dayjs.extend(isBetween);
Expand Down Expand Up @@ -170,3 +172,115 @@ export const generatePlay = (data: ObjectPlayData = {}, meta: PlayMeta = {}): Pl
export const generatePlays = (numberOfPlays: number, data: ObjectPlayData = {}, meta: PlayMeta = {}): PlayObject[] => {
return Array.from(Array(numberOfPlays), () => generatePlay(data, meta));
}

/**
 * Generates a single random artist name.
 *
 * Invokes faker's generator; the previous version returned the `faker.music.artist`
 * function itself rather than a generated name.
 */
export const generateArtist = (): string => faker.music.artist();

/**
 * Generates a list of random artist names for use in artist-string parsing tests.
 *
 * @param num  Exact number of artists to generate. When omitted, between 1 and `max` are generated.
 * @param max  Upper bound on the number of artists when `num` is omitted.
 * @param opts.ambiguousJoinedNames  When false (default), names containing any delimiter other than a
 *                                   lone '&' (as reported by `findDelimiters`) are re-rolled.
 * @param opts.trailingAmpersand     When false (default), the LAST name never contains '&'.
 * @returns The generated names, or an empty array when `num` or `max` is 0.
 */
export const generateArtists = (num?: number, max: number = 3, opts: {ambiguousJoinedNames?: boolean, trailingAmpersand?: boolean} = {}) => {
    if(num === 0 || max === 0) {
        return [];
    }

    const {
        trailingAmpersand = false,
        ambiguousJoinedNames = false
    } = opts;

    // a name is "ambiguous" when it contains delimiters other than a single '&'
    const isAmbiguous = (name: string): boolean => {
        const found = findDelimiters(name);
        return found !== undefined && found.length > 0 && !(found.length === 1 && found[0] === '&');
    };

    const artists = faker.helpers.multiple(faker.music.artist, {count: {min: num ?? 1, max: num ?? max}});
    const lastIndex = artists.length - 1;

    return artists.map((name, index) => {
        // its really hard to parse an artist name that contains an '&' when it comes at the end of a list
        // because it is ambiguous whether '&' is joining the list or is part of the artist name,
        // so by default don't generate these (specific scenarios are tested in playParsing.test.ts)
        const rejectsAmpersand = !trailingAmpersand && index === lastIndex;

        // Both constraints are checked together on every re-roll. Previously the ambiguity pass ran
        // after the trailing-'&' pass and could put an '&' name back into the last position.
        let candidate = name;
        while((!ambiguousJoinedNames && isAmbiguous(candidate)) || (rejectsAmpersand && candidate.includes('&'))) {
            candidate = faker.music.artist();
        }
        return candidate;
    });
}

/**
 * Options controlling how one group of artists (primary or secondary) is generated
 * and joined into a string by `generateArtistsStr`.
 */
export interface ArtistGenerateOptions {
/** Exact number of artists; forwarded as the first argument of `generateArtists` (0 yields no artists) */
num?: number
/** Upper bound on the number of artists; forwarded as the second argument of `generateArtists` */
max?: number
/** Delimiter placed between artists. When omitted, a random element of `JOINERS` is used */
joiner?: string
/**
 * Delimiter placed before the last artist.
 * - `false`: reuse `joiner`
 * - string: use this value
 * - omitted: a random element of `JOINERS_FINAL` when `joiner` is ',' and no artist name contains '&', otherwise `joiner`
 */
finalJoiner?: false | string
/** Forwarded as the fourth argument of `arrayListAnd` -- presumably toggles whitespace around joiners; verify against that helper */
spacedJoiners?: boolean
}

/**
 * Options for the secondary (featured) artist group generated by `generateArtistsStr`.
 * Adds control over the "feat"-style prefix on top of the shared list options.
 */
export interface SecondaryArtistGenerateOptions extends ArtistGenerateOptions {
/** Token placed before the secondary artist list. When omitted, a random element of `FEAT` is used */
ft?: string
/** Whether the secondary section is wrapped in parentheses. When omitted, chosen at random */
ftWrap?: boolean
}

/**
 * Top-level options for `generateArtistsStr`.
 * For either group a bare number is shorthand for `{num: <number>}`.
 */
export interface CompoundArtistGenerateOptions {
/** Primary artist group: an exact count or full options. Defaults to `{}` */
primary?: number | ArtistGenerateOptions
/** Secondary (featured) artist group: an exact count or full options. Defaults to `{}` */
secondary?: number | SecondaryArtistGenerateOptions
}

/**
 * Builds a random "artist credit" string together with the artists it was built from.
 *
 * The primary artists are joined into a list. If any secondary artists were generated they are
 * joined as well, prefixed with a "feat"-style token, optionally wrapped in parentheses and
 * appended to the primary list after a single space.
 *
 * @param options Generation options for the primary and secondary groups (a number is shorthand for `{num}`).
 * @returns A tuple of `[artistString, primaryArtists, secondaryArtists]`.
 */
export const generateArtistsStr = (options: CompoundArtistGenerateOptions = {}): [string, string[], string[]] => {

    const {primary = {}, secondary = {}} = options;

    const primaryOpts: ArtistGenerateOptions = typeof primary === 'number' ? {num: primary} : primary;
    const secondaryOpts: SecondaryArtistGenerateOptions = typeof secondary === 'number' ? {num: secondary} : secondary;

    // Resolves the [joiner, finalJoiner] pair for one artist group.
    const pickJoiners = (opts: ArtistGenerateOptions, names: string[]): [string, string] => {
        const joiner: string = opts.joiner ?? faker.helpers.arrayElement(JOINERS);
        if(opts.finalJoiner === false) {
            return [joiner, joiner];
        }
        if(opts.finalJoiner !== undefined) {
            return [joiner, opts.finalJoiner];
        }
        // a distinct final joiner is only used for comma lists whose names do not already contain '&'
        const useDistinctFinal = joiner === ',' && !names.some(n => n.includes('&'));
        return [joiner, useDistinctFinal ? faker.helpers.arrayElement(JOINERS_FINAL) : joiner];
    };

    // both groups are generated up front, before any joiner is chosen
    const primaryArt = generateArtists(primaryOpts.num, primaryOpts.max)
    const secondaryArt = generateArtists(secondaryOpts.num, secondaryOpts.max);

    const [joinerPrimary, finalJoinerPrimary] = pickJoiners(primaryOpts, primaryArt);
    const primaryStr = arrayListAnd(primaryArt, joinerPrimary, finalJoinerPrimary, primaryOpts.spacedJoiners);

    if(secondaryArt.length === 0) {
        return [primaryStr, primaryArt, []];
    }

    const [joinerSecondary, finalJoinerSecondary] = pickJoiners(secondaryOpts, secondaryArt);
    const secondaryStr = arrayListAnd(secondaryArt, joinerSecondary, finalJoinerSecondary, secondaryOpts.spacedJoiners);

    const ft = secondaryOpts.ft ?? faker.helpers.arrayElement(FEAT);
    const wrap: boolean = secondaryOpts.ftWrap !== undefined ? secondaryOpts.ftWrap : faker.datatype.boolean();

    const featured = `${ft} ${secondaryStr}`;
    const sec = wrap ? `(${featured})` : featured;

    return [`${primaryStr} ${sec}`, primaryArt, secondaryArt];
}
52 changes: 48 additions & 4 deletions src/backend/utils/StringUtils.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { strategies, stringSameness, StringSamenessResult } from "@foxxmd/string-sameness";
import { PlayObject } from "../../core/Atomic.js";
import { asPlayerStateData, DELIMITERS, PlayerStateDataMaybePlay } from "../common/infrastructure/Atomic.js";
import { genGroupIdStr, getPlatformIdFromData, parseRegexSingleOrFail } from "../utils.js";
import { genGroupIdStr, getPlatformIdFromData, intersect, parseRegexSingleOrFail } from "../utils.js";
import { buildTrackString } from "../../core/StringUtils.js";

const {levenStrategy, diceStrategy} = strategies;
Expand Down Expand Up @@ -61,7 +61,7 @@ export const SECONDARY_CAPTURED_REGEX = new RegExp(/[([]\s*(?<joiner>ft\.?\W|fea
* !!!! ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ *******
*
* */
export const SECONDARY_FREE_REGEX = new RegExp(/^\s*(?<joiner>ft\.?\W|feat\.?\W|featuring|vs\.?\W)\s*(?<credits>(?:.+?(?= - |\s*[([]))|(?:.*))(?<creditsSuffix>.*)/i);
// Matches an unwrapped "ft/feat/featuring/vs" section at the start of a string. `credits` stops before
// " - " or before a bracketed group that runs to the end of the string; the remainder is `creditsSuffix`.
export const SECONDARY_FREE_REGEX = /^\s*(?<joiner>ft\.?\W|feat\.?\W|featuring|vs\.?\W)\s*(?<credits>(?:.+?(?= - |\s*[([].+[)\]]$))|(?:.*))(?<creditsSuffix>.*)/i;

const SECONDARY_REGEX_STRATS: RegExp[] = [SECONDARY_CAPTURED_REGEX, SECONDARY_FREE_REGEX];

Expand Down Expand Up @@ -116,7 +116,7 @@ export const parseCredits = (str: string, delimiters?: boolean | string[]): Play
for(const strat of SECONDARY_REGEX_STRATS) {
const secCredits = parseRegexSingleOrFail(strat, results.named.secondary);
if(secCredits !== undefined) {
secondary = parseStringList(secCredits.named.credits as string, delims)
secondary = parseContextAwareStringList(secCredits.named.credits as string, delims)
suffix = secCredits.named.creditsSuffix;
break;
}
Expand Down Expand Up @@ -148,7 +148,7 @@ export const parseArtistCredits = (str: string, delimiters?: boolean | string[])
if (withJoiner !== undefined) {
// all this does is make sure and "ft" or parenthesis/brackets are separated --
// it doesn't also separate primary artists so do that now
const primaries = parseStringList(withJoiner.primary, delims);
const primaries = parseContextAwareStringList(withJoiner.primary, delims);
if (primaries.length > 1) {
return {
primary: primaries[0],
Expand Down Expand Up @@ -182,6 +182,50 @@ export const parseStringList = (str: string, delimiters: string[] = [',', '&', '
return explodedStrings.flat(1);
}, [str]).map(x => x.trim());
}
/**
 * Splits a list-like string into its items, deciding how to treat '&' based on which other
 * delimiters occur in the string.
 *
 * - Slashes/backslashes shielded by `bypassJoiners` are hidden before splitting and restored on every returned item.
 * - When none of `delimiters` occurs in the string (and '&' is not being ignored) the string is split on '&' only.
 * - Otherwise the string is split on `delimiters`; if '&' is being ignored globally, one of `delimiters`
 *   occurred, and the last of several items contains '&', that last item is additionally split on '&'.
 *
 * @param str        String to split.
 * @param delimiters Delimiters to split on. An empty list returns `[str]` untouched.
 * @param opts.ignoreGlobalAmpersand Overrides the '&' decision; defaults to whether any delimiter was found.
 * @returns Trimmed items in their original order.
 */
export const parseContextAwareStringList = (str: string, delimiters: string[] = [',', '/', '\\'], opts: {ignoreGlobalAmpersand?: boolean} = {}): string[] => {
    if (delimiters.length === 0) {
        return [str];
    }

    // bypass tokens using slashes without spaces
    const shielded = bypassJoiners(str);
    const hasDelimiter = delimiters.some(d => shielded.includes(d));
    const ignoreAmpersand = opts.ignoreGlobalAmpersand ?? hasDelimiter;

    const splitOn = hasDelimiter === false && ignoreAmpersand === false ? ['&'] : delimiters;
    const parts = parseStringList(shielded, splitOn);

    const tail = parts[parts.length - 1];
    const splitTail = ignoreAmpersand && parts.length > 1 && tail.includes('&') && hasDelimiter;
    const resolved = splitTail ? [...parts.slice(0, -1), ...tail.split('&')] : parts;

    return resolved.map(part => rejoinBypassed(part.trim()));
}

/**
 * Pairs of transforms used to shield slashes that are part of a name (e.g. "AC/DC") from being
 * treated as list delimiters.
 *
 * `bypass` swaps every '/' or '\' that has a non-whitespace character directly on BOTH sides for a
 * placeholder ('^^^' for '/', '###' for '\'); `rejoin` swaps every placeholder back.
 *
 * Lookarounds are used so that neighbouring characters are not consumed by a match. The previous
 * capturing-group patterns missed adjacent occurrences such as "A/B/C", and for backslashes only
 * shielded/restored the last occurrence on a line (and shielded it even when followed by a space).
 *
 * Limitation: input that already contains a literal '^^^' or '###' is rewritten by `rejoin`.
 */
const bypassJoinerMap = [
    {
        rejoin: (str: string): string => str.replace(/\^\^\^/g, '/'),
        bypass: (str: string): string => str.replace(/(?<=\S)\/(?=\S)/g, '^^^')
    },
    {
        rejoin: (str: string): string => str.replace(/###/g, '\\'),
        bypass: (str: string): string => str.replace(/(?<=\S)\\(?=\S)/g, '###')
    }
];
/**
 * Applies the `bypass` transform of every `bypassJoinerMap` entry, in order, to `str`.
 *
 * @param str String to transform.
 * @returns The string after all bypass transforms have been applied.
 */
export const bypassJoiners = (str: string): string =>
    bypassJoinerMap.reduce((current: string, entry) => entry.bypass(current), str);
/**
 * Applies the `rejoin` transform of every `bypassJoinerMap` entry, in order, to `str`.
 *
 * @param str String to transform.
 * @returns The string after all rejoin transforms have been applied.
 */
export const rejoinBypassed = (str: string): string =>
    bypassJoinerMap.reduce((current: string, entry) => entry.rejoin(current), str);
/** Returns true when `str` contains at least one of the characters `,` `&` `/` `\`. */
export const containsDelimiters = (str: string) => /[,&/\\]/.test(str)
export const findDelimiters = (str: string) => {
const found: string[] = [];
Expand Down
11 changes: 10 additions & 1 deletion src/core/Atomic.ts
Original file line number Diff line number Diff line change
Expand Up @@ -275,4 +275,13 @@ export interface URLData {
url: URL
normal: string
port: number
}
}

// A delimiter that may separate names in a list.
// NOTE(review): the trailing `| string` member widens this union to plain `string` for the type checker,
// so the literal members act as documentation only.
export type Joiner = ',' | '&' | '/' | '\\' | string;
// General list joiners. Does not include '&', which is listed separately in JOINERS_FINAL.
export const JOINERS: Joiner[] = [',','/','\\'];

// A delimiter that may appear before the last item of a list.
export type FinalJoiners = '&';
export const JOINERS_FINAL: FinalJoiners[] = ['&'];

// Tokens that introduce a secondary ("featured") artist section.
export type Feat = 'ft' | 'feat' | 'vs' | 'ft.' | 'feat.' | 'vs.' | 'featuring'
export const FEAT: Feat[] = ['ft','feat','vs','ft.','feat.','vs.','featuring'];
Loading