Visualizing MusicBrainz data evolution

I showed in previous notebooks how to analyze data from the MusicBrainz database, regarding composers, works, performers... now I'd like to look at metadata: the way the data in musicbrainz changed with time, i.e. how new edits modify the

In [1]:
%run startup.ipy
Last notebook update: 2018-06-06
Git repo: git@bitbucket.org:loujine/musicbrainz-dataviz.git
Importing libs
Defining database parameters

Defining *sql* helper function
Last database update: 2018-06-02

Python packages versions:
numpy       1.14.3
pandas      0.23.0
sqlalchemy  1.2.8
CPython 3.7.0b5
IPython 6.4.0

Editors

  • activity curve
  • failed edits by date

Composer

  • activity curve
  • edits grouped by editor, edit type, date

userscripts

  • number of editors
  • number of edits
  • BNF links
  • wikidata links

Edit history of a composer

In [2]:
artist_name = 'Páll Ísólfsson'
In [3]:
def edits_for_artist(artist_name):
    df = sql("""
SELECT edit.open_time AS edit_date, 
       1 AS edit_count,
       editor.name AS editor,
       edit.type AS edit_type,
       edit_data.data AS json
  FROM artist
  JOIN edit_artist ON artist.id = edit_artist.artist
  JOIN edit        ON edit.id = edit_artist.edit
  JOIN edit_data   ON edit_data.edit = edit_artist.edit
  JOIN editor ON editor.id = edit.editor
 WHERE artist.name = %(artist_name)s
""", artist_name=artist_name)
    df['edit_date'] = df['edit_date'].apply(lambda d: d.date())
    return df

edits = edits_for_artist(artist_name)
edits.head()
Out[3]:
edit_date edit_count editor edit_type json
0 2013-05-28 1 maggo49 1 {'name': 'Páll Ísólfsson', 'comment': 'Iceland...
1 2013-05-28 1 maggo49 51 {'release': {'id': 1294643, 'name': 'Píanó'}, ...
2 2015-12-07 1 ProfChris 90 {'ended': 0, 'type0': 'artist', 'type1': 'reco...
3 2015-12-07 1 ProfChris 90 {'ended': 0, 'type0': 'artist', 'type1': 'reco...
4 2015-12-07 1 ProfChris 90 {'ended': 0, 'type0': 'artist', 'type1': 'reco...
In [4]:
set(edits.edit_type)
Out[4]:
{1, 2, 9, 20, 21, 31, 32, 34, 35, 37, 42, 51, 52, 72, 90, 91, 314}
In [5]:
df = edits
df[['edit_date', 'edit_type']].groupby('edit_date').count()
Out[5]:
edit_type
edit_date
2013-05-28 2
2015-12-07 9
2016-02-28 1
2016-03-04 8
2016-03-13 21
2016-03-14 6
2016-03-15 45
2016-04-16 3
2016-09-25 3
2016-10-13 6
2016-12-07 1
2016-12-12 1
In [6]:
edit_frequency = df[['edit_date', 'edit_count']].groupby('edit_date').sum()
#print(edit_frequency)
iplot({
    'data': [dict(type='bar', x=edit_frequency.index, y=edit_frequency.edit_count)],
    'layout': {'title': "Number of edits per date"}
})
In [7]:
edit_frequency = df.groupby(['edit_date', 'edit_type']).sum()
iplot({
    'data': [dict(type='bar', x=edit_frequency.index, y=edit_frequency.edit_count)],
    'layout': {'title': "Number of edits per date"}
})
In [8]:
#edits = edits_for_artist('Johann Sebastian Bach')
edit_frequency = df[['edit_date', 'edit_count']].groupby('edit_date').sum()
iplot({
    'data': [dict(type='bar', x=edit_frequency.index, y=edit_frequency.edit_count)],
    'layout': {'title': "Number of edits per date"}
})
In [9]:
from collections import Counter
Counter(edits.edit_type)
Out[9]:
Counter({1: 1,
         51: 5,
         90: 59,
         52: 8,
         72: 8,
         2: 3,
         20: 3,
         31: 3,
         34: 3,
         37: 1,
         35: 2,
         21: 1,
         32: 2,
         42: 4,
         314: 1,
         91: 1,
         9: 1})
In [10]:
# new recordings
#pprint(edits[edits.edit_type == 51].json.values[0])

# recordings changed artist
pprint(edits[edits.edit_type == 72].json.values[0])
{'entity': {'gid': 'b6e0629c-2900-4470-90bd-fd757cc2b396',
            'id': 14958450,
            'name': 'Three pianopieces, op. 5 - Capriccio'},
 'new': {'artist_credit': {'names': [{'artist': {'id': 1011746,
                                                 'name': 'Gísli Magnússon'},
                                      'join_phrase': '',
                                      'name': 'Gísli Magnússon'}]}},
 'old': {'artist_credit': {'names': [{'artist': {'id': 1011747,
                                                 'name': 'Páll Ísólfsson'},
                                      'join_phrase': '',
                                      'name': 'Páll Ísólfsson'}]}}}

 Edit types on work

In [11]:
from collections import Counter

WORKGID = '3308d248-b287-38e9-9718-0119d7a3bf9d'
df = sql("""
SELECT edit.type
--, edit.open_time
--, edit_data.data 
FROM work
JOIN edit_work ON work.id=edit_work.work
JOIN edit ON edit.id=edit_work.edit
JOIN edit_data ON edit_data.edit=edit_work.edit
WHERE work.gid='%s'
-- AND edit.type NOT IN (90,72,51,42, 1, 52, 2, 20, 31, 34, 37, 35, 21, 32, 314,91,9)
-- LIMIT 1
""" % WORKGID)

pprint(Counter(df.type))
Counter({90: 222, 91: 123, 233: 97, 92: 20, 234: 15, 42: 5, 44: 3, 99: 1})
In [12]:
def max_edited_entities(entity_type, min_count=1000):
    assert entity_type in ('work', 'artist')
    return sql(
        """
WITH results AS (
    SELECT COUNT(*) AS edit_count, 
           {entity_type}.name,
           'https://musicbrainz.org/{entity_type}/' || {entity_type}.gid AS url
    FROM {entity_type}
    JOIN edit_{entity_type} ON {entity_type}.id=edit_{entity_type}.{entity_type}
    WHERE {entity_type}.name NOT IN ('[unknown]', 'Deleted Artist', 'Various Artists')
    GROUP BY {entity_type}.id
)
SELECT * 
FROM results
WHERE edit_count > %(min_count)s
ORDER BY edit_count DESC
LIMIT 100
""".format(entity_type=entity_type), entity_type=entity_type, min_count=min_count)

#fix url
iplot(ff.create_table(max_edited_entities('work', min_count=1000)))
In [13]:
def query_edits(entity_type, entity_mbid):
    assert entity_type in ('work', 'artist')
    template = """
SELECT edit.open_time AS date, 
       1 AS count, 
       edit.type AS type,
       editor.name AS editor,
       edit_data.data AS data
FROM {entity_type}
JOIN edit_{entity_type} ON {entity_type}.id=edit_{entity_type}.{entity_type}
JOIN edit ON edit.id=edit_{entity_type}.edit
JOIN edit_data ON edit_data.edit=edit_{entity_type}.edit
JOIN editor ON editor.id=edit.editor
WHERE {entity_type}.gid=%(entity_mbid)s
""".format(entity_type=entity_type)
    df = sql(template, entity_mbid=entity_mbid)
    df['date'] = df['date'].apply(lambda d: d.date())
    return df
In [14]:
# Beethoven's Ninth, fourth movement
ninth = query_edits('work', '3761b925-82b2-3a61-9581-57751dee2e9e')
print(ninth.head())
         date  count  type     editor  \
0  2010-08-08      1   234  gswanjord   
1  2010-08-10      1   233  gswanjord   
2  2010-08-10      1   233  gswanjord   
3  2010-08-10      1   233  gswanjord   
4  2010-08-10      1   233  gswanjord   

                                                data  
0  {'new': {'links': [{'entity0_id': '1021', 'ent...  
1  {'links': [{'entity0_id': '1021', 'entity1_id'...  
2  {'links': [{'entity0_id': '1021', 'entity1_id'...  
3  {'links': [{'entity0_id': '1021', 'entity1_id'...  
4  {'links': [{'entity0_id': '1021', 'entity1_id'...  
In [15]:
ninth.shape
Out[15]:
(1242, 5)

composer history: nb of recordings vs. edit date events: new recording change recording artist merge recording delete recording

test

In [16]:
artist_name='Páll Ísólfsson'
#artist_name='Claude Debussy'

EDIT_RECORDING_CREATE = 71
EDIT_RECORDING_EDIT = 72
EDIT_RECORDING_DELETE = 73
EDIT_RECORDING_MERGE = 74

def edits_from_new_medium(artist_name):
    artist_id = sql("SELECT id FROM artist WHERE name=%(artist_name)s", 
                    artist_name=artist_name).id[0]
    df = sql("""
SELECT date_trunc('day', edit.open_time) AS date,
       edit_data.data->'tracklist' AS tracklist
  FROM artist
  JOIN edit_artist ON artist.id = edit_artist.artist
  JOIN edit        ON edit.id = edit_artist.edit
  JOIN edit_data   ON edit_data.edit = edit_artist.edit
 WHERE artist.name = %(artist_name)s
   AND edit.type = 51
    ;""", artist_name=artist_name)
    df.index = df['date'].apply(lambda d: d.date())
    del df['date']
    df['cnt'] = df.tracklist.apply(lambda tl: [t['artist_credit']['names'][0]['artist']['id'] 
                                               for t in tl].count(artist_id))
    del df['tracklist']
    return df

def edits_from_standalone_recordings(artist_name):
    df = sql("""
SELECT date_trunc('day', edit.open_time) AS date,
       COUNT(*) AS cnt
  FROM artist
  JOIN edit_artist ON artist.id = edit_artist.artist
  JOIN edit        ON edit.id = edit_artist.edit
  JOIN edit_data   ON edit_data.edit = edit_artist.edit
 WHERE artist.name = %(artist_name)s
   AND edit.type = 71
GROUP BY date
ORDER BY date
    ;""", artist_name=artist_name)
    df.index = df['date'].apply(lambda d: d.date())
    del df['date']
    return df

def edits_from_recordings_changed_from(artist_name):
    df = sql("""    
SELECT date_trunc('day', edit.open_time) AS date,
       COUNT(*) AS cnt
  FROM artist
  JOIN edit_artist ON artist.id = edit_artist.artist
  JOIN edit        ON edit.id = edit_artist.edit
  JOIN edit_data   ON edit_data.edit = edit_artist.edit
 WHERE artist.name = %(artist_name)s
    AND edit.type = 72
  AND artist.id = CAST(edit_data.data->'old'->'artist_credit'->'names'->0->'artist'->>'id' AS integer)
GROUP BY date
ORDER BY date
    ;""", artist_name=artist_name)
    df.index = df['date'].apply(lambda d: d.date())
    del df['date']
    df['cnt'] = - df['cnt']
    return df

def edits_from_recordings_changed_to(artist_name):
    sql("""    
SELECT date_trunc('day', edit.open_time) AS date,
       COUNT(*) AS cnt
  FROM artist
  JOIN edit_artist ON artist.id = edit_artist.artist
  JOIN edit        ON edit.id = edit_artist.edit
  JOIN edit_data   ON edit_data.edit = edit_artist.edit
 WHERE artist.name = %(artist_name)s
    AND edit.type = 72
  AND artist.id = CAST(edit_data.data->'new'->'artist_credit'->'names'->0->'artist'->>'id' AS integer)
GROUP BY date
ORDER BY date
    ;""", artist_name=artist_name)
    df.index = df['date'].apply(lambda d: d.date())
    del df['date']
    return df

def edits_from_recordings_merge(artist_name):
    return
    sql("""    
SELECT date_trunc('day', edit.open_time) AS date,
       COUNT(*) AS cnt
  FROM artist
  JOIN edit_artist ON artist.id = edit_artist.artist
  JOIN edit        ON edit.id = edit_artist.edit
  JOIN edit_data   ON edit_data.edit = edit_artist.edit
 WHERE artist.name = %(artist_name)s
    AND edit.type = 74
GROUP BY date
ORDER BY date
    ;""", artist_name=artist_name)
    df['date'] = df['date'].apply(lambda d: d.date())
    return df
In [17]:
#artist_name='Páll Ísólfsson'
artist_name='Claude Debussy'
df1 = edits_from_new_medium(artist_name)
df2 = edits_from_standalone_recordings(artist_name)
df3 = edits_from_recordings_changed_from(artist_name)
#df4 = edits_from_recordings_changed_to(artist_name)
In [18]:
df2
Out[18]:
cnt
date
2017-10-04 1
In [20]:
%matplotlib inline
#from matplotlib import pyplot
#pyplot.ion()
#fig = pyplot.figure()
#ax = fig.add_subplot(111)
df1.plot.bar()
df2.plot.bar()
df3.plot.bar()
#df4.plot.bar()
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f36cd50ba90>