Visualizing MusicBrainz data evolution¶

I showed in previous notebooks how to analyze data from the MusicBrainz database, regarding composers, works, performers... now I'd like to look at metadata: the way the data in musicbrainz changed with time, i.e. how new edits modify the

%run startup.ipy

Last notebook update: 2018-06-06
Git repo: git@bitbucket.org:loujine/musicbrainz-dataviz.git
Importing libs

Defining database parameters

Defining *sql* helper function
Last database update: 2018-06-02

Python packages versions:
numpy       1.14.3
pandas      0.23.0
sqlalchemy  1.2.8
CPython 3.7.0b5
IPython 6.4.0

Editors

activity curve
failed edits by date

Composer

activity curve
edits grouped by editor, edit type, date

userscripts

number of editors
number of edits

BNF links
wikidata links

Edit history of a composer¶

artist_name = 'Páll Ísólfsson'

def edits_for_artist(artist_name):
    df = sql("""
SELECT edit.open_time AS edit_date, 
       1 AS edit_count,
       editor.name AS editor,
       edit.type AS edit_type,
       edit_data.data AS json
  FROM artist
  JOIN edit_artist ON artist.id = edit_artist.artist
  JOIN edit        ON edit.id = edit_artist.edit
  JOIN edit_data   ON edit_data.edit = edit_artist.edit
  JOIN editor ON editor.id = edit.editor
 WHERE artist.name = %(artist_name)s
""", artist_name=artist_name)
    df['edit_date'] = df['edit_date'].apply(lambda d: d.date())
    return df

edits = edits_for_artist(artist_name)
edits.head()

set(edits.edit_type)

{1, 2, 9, 20, 21, 31, 32, 34, 35, 37, 42, 51, 52, 72, 90, 91, 314}

df = edits
df[['edit_date', 'edit_type']].groupby('edit_date').count()

edit_frequency = df[['edit_date', 'edit_count']].groupby('edit_date').sum()
#print(edit_frequency)
iplot({
    'data': [dict(type='bar', x=edit_frequency.index, y=edit_frequency.edit_count)],
    'layout': {'title': "Number of edits per date"}
})

edit_frequency = df.groupby(['edit_date', 'edit_type']).sum()
iplot({
    'data': [dict(type='bar', x=edit_frequency.index, y=edit_frequency.edit_count)],
    'layout': {'title': "Number of edits per date"}
})

#edits = edits_for_artist('Johann Sebastian Bach')
edit_frequency = df[['edit_date', 'edit_count']].groupby('edit_date').sum()
iplot({
    'data': [dict(type='bar', x=edit_frequency.index, y=edit_frequency.edit_count)],
    'layout': {'title': "Number of edits per date"}
})

from collections import Counter
Counter(edits.edit_type)

Counter({1: 1,
         51: 5,
         90: 59,
         52: 8,
         72: 8,
         2: 3,
         20: 3,
         31: 3,
         34: 3,
         37: 1,
         35: 2,
         21: 1,
         32: 2,
         42: 4,
         314: 1,
         91: 1,
         9: 1})

# new recordings
#pprint(edits[edits.edit_type == 51].json.values[0])

# recordings changed artist
pprint(edits[edits.edit_type == 72].json.values[0])

{'entity': {'gid': 'b6e0629c-2900-4470-90bd-fd757cc2b396',
            'id': 14958450,
            'name': 'Three pianopieces, op. 5 - Capriccio'},
 'new': {'artist_credit': {'names': [{'artist': {'id': 1011746,
                                                 'name': 'Gísli Magnússon'},
                                      'join_phrase': '',
                                      'name': 'Gísli Magnússon'}]}},
 'old': {'artist_credit': {'names': [{'artist': {'id': 1011747,
                                                 'name': 'Páll Ísólfsson'},
                                      'join_phrase': '',
                                      'name': 'Páll Ísólfsson'}]}}}

Edit types on work¶

from collections import Counter

WORKGID = '3308d248-b287-38e9-9718-0119d7a3bf9d'
df = sql("""
SELECT edit.type
--, edit.open_time
--, edit_data.data 
FROM work
JOIN edit_work ON work.id=edit_work.work
JOIN edit ON edit.id=edit_work.edit
JOIN edit_data ON edit_data.edit=edit_work.edit
WHERE work.gid='%s'
-- AND edit.type NOT IN (90,72,51,42, 1, 52, 2, 20, 31, 34, 37, 35, 21, 32, 314,91,9)
-- LIMIT 1
""" % WORKGID)

pprint(Counter(df.type))

Counter({90: 222, 91: 123, 233: 97, 92: 20, 234: 15, 42: 5, 44: 3, 99: 1})

def max_edited_entities(entity_type, min_count=1000):
    assert entity_type in ('work', 'artist')
    return sql(
        """
WITH results AS (
    SELECT COUNT(*) AS edit_count, 
           {entity_type}.name,
           'https://musicbrainz.org/{entity_type}/' || {entity_type}.gid AS url
    FROM {entity_type}
    JOIN edit_{entity_type} ON {entity_type}.id=edit_{entity_type}.{entity_type}
    WHERE {entity_type}.name NOT IN ('[unknown]', 'Deleted Artist', 'Various Artists')
    GROUP BY {entity_type}.id
)
SELECT * 
FROM results
WHERE edit_count > %(min_count)s
ORDER BY edit_count DESC
LIMIT 100
""".format(entity_type=entity_type), entity_type=entity_type, min_count=min_count)

#fix url
iplot(ff.create_table(max_edited_entities('work', min_count=1000)))

def query_edits(entity_type, entity_mbid):
    assert entity_type in ('work', 'artist')
    template = """
SELECT edit.open_time AS date, 
       1 AS count, 
       edit.type AS type,
       editor.name AS editor,
       edit_data.data AS data
FROM {entity_type}
JOIN edit_{entity_type} ON {entity_type}.id=edit_{entity_type}.{entity_type}
JOIN edit ON edit.id=edit_{entity_type}.edit
JOIN edit_data ON edit_data.edit=edit_{entity_type}.edit
JOIN editor ON editor.id=edit.editor
WHERE {entity_type}.gid=%(entity_mbid)s
""".format(entity_type=entity_type)
    df = sql(template, entity_mbid=entity_mbid)
    df['date'] = df['date'].apply(lambda d: d.date())
    return df

# Beethoven's Ninth, fourth movement
ninth = query_edits('work', '3761b925-82b2-3a61-9581-57751dee2e9e')
print(ninth.head())

         date  count  type     editor  \
0  2010-08-08      1   234  gswanjord   
1  2010-08-10      1   233  gswanjord   
2  2010-08-10      1   233  gswanjord   
3  2010-08-10      1   233  gswanjord   
4  2010-08-10      1   233  gswanjord   

                                                data  
0  {'new': {'links': [{'entity0_id': '1021', 'ent...  
1  {'links': [{'entity0_id': '1021', 'entity1_id'...  
2  {'links': [{'entity0_id': '1021', 'entity1_id'...  
3  {'links': [{'entity0_id': '1021', 'entity1_id'...  
4  {'links': [{'entity0_id': '1021', 'entity1_id'...

ninth.shape

(1242, 5)

composer history: nb of recordings vs. edit date events: new recording change recording artist merge recording delete recording

test¶

artist_name='Páll Ísólfsson'
#artist_name='Claude Debussy'

EDIT_RECORDING_CREATE = 71
EDIT_RECORDING_EDIT = 72
EDIT_RECORDING_DELETE = 73
EDIT_RECORDING_MERGE = 74

def edits_from_new_medium(artist_name):
    artist_id = sql("SELECT id FROM artist WHERE name=%(artist_name)s", 
                    artist_name=artist_name).id[0]
    df = sql("""
SELECT date_trunc('day', edit.open_time) AS date,
       edit_data.data->'tracklist' AS tracklist
  FROM artist
  JOIN edit_artist ON artist.id = edit_artist.artist
  JOIN edit        ON edit.id = edit_artist.edit
  JOIN edit_data   ON edit_data.edit = edit_artist.edit
 WHERE artist.name = %(artist_name)s
   AND edit.type = 51
    ;""", artist_name=artist_name)
    df.index = df['date'].apply(lambda d: d.date())
    del df['date']
    df['cnt'] = df.tracklist.apply(lambda tl: [t['artist_credit']['names'][0]['artist']['id'] 
                                               for t in tl].count(artist_id))
    del df['tracklist']
    return df

def edits_from_standalone_recordings(artist_name):
    df = sql("""
SELECT date_trunc('day', edit.open_time) AS date,
       COUNT(*) AS cnt
  FROM artist
  JOIN edit_artist ON artist.id = edit_artist.artist
  JOIN edit        ON edit.id = edit_artist.edit
  JOIN edit_data   ON edit_data.edit = edit_artist.edit
 WHERE artist.name = %(artist_name)s
   AND edit.type = 71
GROUP BY date
ORDER BY date
    ;""", artist_name=artist_name)
    df.index = df['date'].apply(lambda d: d.date())
    del df['date']
    return df

def edits_from_recordings_changed_from(artist_name):
    df = sql("""    
SELECT date_trunc('day', edit.open_time) AS date,
       COUNT(*) AS cnt
  FROM artist
  JOIN edit_artist ON artist.id = edit_artist.artist
  JOIN edit        ON edit.id = edit_artist.edit
  JOIN edit_data   ON edit_data.edit = edit_artist.edit
 WHERE artist.name = %(artist_name)s
    AND edit.type = 72
  AND artist.id = CAST(edit_data.data->'old'->'artist_credit'->'names'->0->'artist'->>'id' AS integer)
GROUP BY date
ORDER BY date
    ;""", artist_name=artist_name)
    df.index = df['date'].apply(lambda d: d.date())
    del df['date']
    df['cnt'] = - df['cnt']
    return df

def edits_from_recordings_changed_to(artist_name):
    sql("""    
SELECT date_trunc('day', edit.open_time) AS date,
       COUNT(*) AS cnt
  FROM artist
  JOIN edit_artist ON artist.id = edit_artist.artist
  JOIN edit        ON edit.id = edit_artist.edit
  JOIN edit_data   ON edit_data.edit = edit_artist.edit
 WHERE artist.name = %(artist_name)s
    AND edit.type = 72
  AND artist.id = CAST(edit_data.data->'new'->'artist_credit'->'names'->0->'artist'->>'id' AS integer)
GROUP BY date
ORDER BY date
    ;""", artist_name=artist_name)
    df.index = df['date'].apply(lambda d: d.date())
    del df['date']
    return df

def edits_from_recordings_merge(artist_name):
    return
    sql("""    
SELECT date_trunc('day', edit.open_time) AS date,
       COUNT(*) AS cnt
  FROM artist
  JOIN edit_artist ON artist.id = edit_artist.artist
  JOIN edit        ON edit.id = edit_artist.edit
  JOIN edit_data   ON edit_data.edit = edit_artist.edit
 WHERE artist.name = %(artist_name)s
    AND edit.type = 74
GROUP BY date
ORDER BY date
    ;""", artist_name=artist_name)
    df['date'] = df['date'].apply(lambda d: d.date())
    return df

#artist_name='Páll Ísólfsson'
artist_name='Claude Debussy'
df1 = edits_from_new_medium(artist_name)
df2 = edits_from_standalone_recordings(artist_name)
df3 = edits_from_recordings_changed_from(artist_name)
#df4 = edits_from_recordings_changed_to(artist_name)

df2

%matplotlib inline
#from matplotlib import pyplot
#pyplot.ion()
#fig = pyplot.figure()
#ax = fig.add_subplot(111)
df1.plot.bar()
df2.plot.bar()
df3.plot.bar()
#df4.plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0x7f36cd50ba90>

	edit_date	edit_count	editor	edit_type	json
0	2013-05-28	1	maggo49	1	{'name': 'Páll Ísólfsson', 'comment': 'Iceland...
1	2013-05-28	1	maggo49	51	{'release': {'id': 1294643, 'name': 'Píanó'}, ...
2	2015-12-07	1	ProfChris	90	{'ended': 0, 'type0': 'artist', 'type1': 'reco...
3	2015-12-07	1	ProfChris	90	{'ended': 0, 'type0': 'artist', 'type1': 'reco...
4	2015-12-07	1	ProfChris	90	{'ended': 0, 'type0': 'artist', 'type1': 'reco...

	edit_type
edit_date
2013-05-28	2
2015-12-07	9
2016-02-28	1
2016-03-04	8
2016-03-13	21
2016-03-14	6
2016-03-15	45
2016-04-16	3
2016-09-25	3
2016-10-13	6
2016-12-07	1
2016-12-12	1