I showed in previous notebooks how to analyze data from the MusicBrainz database, regarding composers, works, performers... now I'd like to look at metadata: the way the data in musicbrainz changed with time, i.e. how new edits modify the
%run startup.ipy
Editors
Composer
userscripts
artist_name = 'Páll Ísólfsson'
def edits_for_artist(artist_name):
    df = sql("""
SELECT edit.open_time AS edit_date, 
       1 AS edit_count,
       editor.name AS editor,
       edit.type AS edit_type,
       edit_data.data AS json
  FROM artist
  JOIN edit_artist ON artist.id = edit_artist.artist
  JOIN edit        ON edit.id = edit_artist.edit
  JOIN edit_data   ON edit_data.edit = edit_artist.edit
  JOIN editor ON editor.id = edit.editor
 WHERE artist.name = %(artist_name)s
""", artist_name=artist_name)
    df['edit_date'] = df['edit_date'].apply(lambda d: d.date())
    return df
edits = edits_for_artist(artist_name)
edits.head()
set(edits.edit_type)
df = edits
df[['edit_date', 'edit_type']].groupby('edit_date').count()
edit_frequency = df[['edit_date', 'edit_count']].groupby('edit_date').sum()
#print(edit_frequency)
iplot({
    'data': [dict(type='bar', x=edit_frequency.index, y=edit_frequency.edit_count)],
    'layout': {'title': "Number of edits per date"}
})
edit_frequency = df.groupby(['edit_date', 'edit_type']).sum()
iplot({
    'data': [dict(type='bar', x=edit_frequency.index, y=edit_frequency.edit_count)],
    'layout': {'title': "Number of edits per date"}
})
#edits = edits_for_artist('Johann Sebastian Bach')
edit_frequency = df[['edit_date', 'edit_count']].groupby('edit_date').sum()
iplot({
    'data': [dict(type='bar', x=edit_frequency.index, y=edit_frequency.edit_count)],
    'layout': {'title': "Number of edits per date"}
})
from collections import Counter
Counter(edits.edit_type)
# new recordings
#pprint(edits[edits.edit_type == 51].json.values[0])
# recordings changed artist
pprint(edits[edits.edit_type == 72].json.values[0])
from collections import Counter
WORKGID = '3308d248-b287-38e9-9718-0119d7a3bf9d'
df = sql("""
SELECT edit.type
--, edit.open_time
--, edit_data.data 
FROM work
JOIN edit_work ON work.id=edit_work.work
JOIN edit ON edit.id=edit_work.edit
JOIN edit_data ON edit_data.edit=edit_work.edit
WHERE work.gid='%s'
-- AND edit.type NOT IN (90,72,51,42, 1, 52, 2, 20, 31, 34, 37, 35, 21, 32, 314,91,9)
-- LIMIT 1
""" % WORKGID)
pprint(Counter(df.type))
def max_edited_entities(entity_type, min_count=1000):
    assert entity_type in ('work', 'artist')
    return sql(
        """
WITH results AS (
    SELECT COUNT(*) AS edit_count, 
           {entity_type}.name,
           'https://musicbrainz.org/{entity_type}/' || {entity_type}.gid AS url
    FROM {entity_type}
    JOIN edit_{entity_type} ON {entity_type}.id=edit_{entity_type}.{entity_type}
    WHERE {entity_type}.name NOT IN ('[unknown]', 'Deleted Artist', 'Various Artists')
    GROUP BY {entity_type}.id
)
SELECT * 
FROM results
WHERE edit_count > %(min_count)s
ORDER BY edit_count DESC
LIMIT 100
""".format(entity_type=entity_type), entity_type=entity_type, min_count=min_count)
#fix url
iplot(ff.create_table(max_edited_entities('work', min_count=1000)))
def query_edits(entity_type, entity_mbid):
    assert entity_type in ('work', 'artist')
    template = """
SELECT edit.open_time AS date, 
       1 AS count, 
       edit.type AS type,
       editor.name AS editor,
       edit_data.data AS data
FROM {entity_type}
JOIN edit_{entity_type} ON {entity_type}.id=edit_{entity_type}.{entity_type}
JOIN edit ON edit.id=edit_{entity_type}.edit
JOIN edit_data ON edit_data.edit=edit_{entity_type}.edit
JOIN editor ON editor.id=edit.editor
WHERE {entity_type}.gid=%(entity_mbid)s
""".format(entity_type=entity_type)
    df = sql(template, entity_mbid=entity_mbid)
    df['date'] = df['date'].apply(lambda d: d.date())
    return df
# Beethoven's Ninth, fourth movement
ninth = query_edits('work', '3761b925-82b2-3a61-9581-57751dee2e9e')
print(ninth.head())
ninth.shape
composer history: nb of recordings vs. edit date events: new recording change recording artist merge recording delete recording
artist_name='Páll Ísólfsson'
#artist_name='Claude Debussy'
EDIT_RECORDING_CREATE = 71
EDIT_RECORDING_EDIT = 72
EDIT_RECORDING_DELETE = 73
EDIT_RECORDING_MERGE = 74
def edits_from_new_medium(artist_name):
    artist_id = sql("SELECT id FROM artist WHERE name=%(artist_name)s", 
                    artist_name=artist_name).id[0]
    df = sql("""
SELECT date_trunc('day', edit.open_time) AS date,
       edit_data.data->'tracklist' AS tracklist
  FROM artist
  JOIN edit_artist ON artist.id = edit_artist.artist
  JOIN edit        ON edit.id = edit_artist.edit
  JOIN edit_data   ON edit_data.edit = edit_artist.edit
 WHERE artist.name = %(artist_name)s
   AND edit.type = 51
    ;""", artist_name=artist_name)
    df.index = df['date'].apply(lambda d: d.date())
    del df['date']
    df['cnt'] = df.tracklist.apply(lambda tl: [t['artist_credit']['names'][0]['artist']['id'] 
                                               for t in tl].count(artist_id))
    del df['tracklist']
    return df
def edits_from_standalone_recordings(artist_name):
    df = sql("""
SELECT date_trunc('day', edit.open_time) AS date,
       COUNT(*) AS cnt
  FROM artist
  JOIN edit_artist ON artist.id = edit_artist.artist
  JOIN edit        ON edit.id = edit_artist.edit
  JOIN edit_data   ON edit_data.edit = edit_artist.edit
 WHERE artist.name = %(artist_name)s
   AND edit.type = 71
GROUP BY date
ORDER BY date
    ;""", artist_name=artist_name)
    df.index = df['date'].apply(lambda d: d.date())
    del df['date']
    return df
def edits_from_recordings_changed_from(artist_name):
    df = sql("""    
SELECT date_trunc('day', edit.open_time) AS date,
       COUNT(*) AS cnt
  FROM artist
  JOIN edit_artist ON artist.id = edit_artist.artist
  JOIN edit        ON edit.id = edit_artist.edit
  JOIN edit_data   ON edit_data.edit = edit_artist.edit
 WHERE artist.name = %(artist_name)s
    AND edit.type = 72
  AND artist.id = CAST(edit_data.data->'old'->'artist_credit'->'names'->0->'artist'->>'id' AS integer)
GROUP BY date
ORDER BY date
    ;""", artist_name=artist_name)
    df.index = df['date'].apply(lambda d: d.date())
    del df['date']
    df['cnt'] = - df['cnt']
    return df
def edits_from_recordings_changed_to(artist_name):
    sql("""    
SELECT date_trunc('day', edit.open_time) AS date,
       COUNT(*) AS cnt
  FROM artist
  JOIN edit_artist ON artist.id = edit_artist.artist
  JOIN edit        ON edit.id = edit_artist.edit
  JOIN edit_data   ON edit_data.edit = edit_artist.edit
 WHERE artist.name = %(artist_name)s
    AND edit.type = 72
  AND artist.id = CAST(edit_data.data->'new'->'artist_credit'->'names'->0->'artist'->>'id' AS integer)
GROUP BY date
ORDER BY date
    ;""", artist_name=artist_name)
    df.index = df['date'].apply(lambda d: d.date())
    del df['date']
    return df
def edits_from_recordings_merge(artist_name):
    return
    sql("""    
SELECT date_trunc('day', edit.open_time) AS date,
       COUNT(*) AS cnt
  FROM artist
  JOIN edit_artist ON artist.id = edit_artist.artist
  JOIN edit        ON edit.id = edit_artist.edit
  JOIN edit_data   ON edit_data.edit = edit_artist.edit
 WHERE artist.name = %(artist_name)s
    AND edit.type = 74
GROUP BY date
ORDER BY date
    ;""", artist_name=artist_name)
    df['date'] = df['date'].apply(lambda d: d.date())
    return df
#artist_name='Páll Ísólfsson'
artist_name='Claude Debussy'
df1 = edits_from_new_medium(artist_name)
df2 = edits_from_standalone_recordings(artist_name)
df3 = edits_from_recordings_changed_from(artist_name)
#df4 = edits_from_recordings_changed_to(artist_name)
df2
%matplotlib inline
#from matplotlib import pyplot
#pyplot.ion()
#fig = pyplot.figure()
#ax = fig.add_subplot(111)
df1.plot.bar()
df2.plot.bar()
df3.plot.bar()
#df4.plot.bar()