I showed in previous notebooks how to analyze data from the MusicBrainz database, regarding composers, works, performers... now I'd like to look at metadata: the way the data in musicbrainz changed with time, i.e. how new edits modify the
%run startup.ipy
Editors
Composer
userscripts
artist_name = 'Páll Ísólfsson'
def edits_for_artist(artist_name):
df = sql("""
SELECT edit.open_time AS edit_date,
1 AS edit_count,
editor.name AS editor,
edit.type AS edit_type,
edit_data.data AS json
FROM artist
JOIN edit_artist ON artist.id = edit_artist.artist
JOIN edit ON edit.id = edit_artist.edit
JOIN edit_data ON edit_data.edit = edit_artist.edit
JOIN editor ON editor.id = edit.editor
WHERE artist.name = %(artist_name)s
""", artist_name=artist_name)
df['edit_date'] = df['edit_date'].apply(lambda d: d.date())
return df
edits = edits_for_artist(artist_name)
edits.head()
set(edits.edit_type)
df = edits
df[['edit_date', 'edit_type']].groupby('edit_date').count()
edit_frequency = df[['edit_date', 'edit_count']].groupby('edit_date').sum()
#print(edit_frequency)
iplot({
'data': [dict(type='bar', x=edit_frequency.index, y=edit_frequency.edit_count)],
'layout': {'title': "Number of edits per date"}
})
edit_frequency = df.groupby(['edit_date', 'edit_type']).sum()
iplot({
'data': [dict(type='bar', x=edit_frequency.index, y=edit_frequency.edit_count)],
'layout': {'title': "Number of edits per date"}
})
#edits = edits_for_artist('Johann Sebastian Bach')
edit_frequency = df[['edit_date', 'edit_count']].groupby('edit_date').sum()
iplot({
'data': [dict(type='bar', x=edit_frequency.index, y=edit_frequency.edit_count)],
'layout': {'title': "Number of edits per date"}
})
from collections import Counter
Counter(edits.edit_type)
# new recordings
#pprint(edits[edits.edit_type == 51].json.values[0])
# recordings changed artist
pprint(edits[edits.edit_type == 72].json.values[0])
from collections import Counter
WORKGID = '3308d248-b287-38e9-9718-0119d7a3bf9d'
df = sql("""
SELECT edit.type
--, edit.open_time
--, edit_data.data
FROM work
JOIN edit_work ON work.id=edit_work.work
JOIN edit ON edit.id=edit_work.edit
JOIN edit_data ON edit_data.edit=edit_work.edit
WHERE work.gid='%s'
-- AND edit.type NOT IN (90,72,51,42, 1, 52, 2, 20, 31, 34, 37, 35, 21, 32, 314,91,9)
-- LIMIT 1
""" % WORKGID)
pprint(Counter(df.type))
def max_edited_entities(entity_type, min_count=1000):
assert entity_type in ('work', 'artist')
return sql(
"""
WITH results AS (
SELECT COUNT(*) AS edit_count,
{entity_type}.name,
'https://musicbrainz.org/{entity_type}/' || {entity_type}.gid AS url
FROM {entity_type}
JOIN edit_{entity_type} ON {entity_type}.id=edit_{entity_type}.{entity_type}
WHERE {entity_type}.name NOT IN ('[unknown]', 'Deleted Artist', 'Various Artists')
GROUP BY {entity_type}.id
)
SELECT *
FROM results
WHERE edit_count > %(min_count)s
ORDER BY edit_count DESC
LIMIT 100
""".format(entity_type=entity_type), entity_type=entity_type, min_count=min_count)
#fix url
iplot(ff.create_table(max_edited_entities('work', min_count=1000)))
def query_edits(entity_type, entity_mbid):
assert entity_type in ('work', 'artist')
template = """
SELECT edit.open_time AS date,
1 AS count,
edit.type AS type,
editor.name AS editor,
edit_data.data AS data
FROM {entity_type}
JOIN edit_{entity_type} ON {entity_type}.id=edit_{entity_type}.{entity_type}
JOIN edit ON edit.id=edit_{entity_type}.edit
JOIN edit_data ON edit_data.edit=edit_{entity_type}.edit
JOIN editor ON editor.id=edit.editor
WHERE {entity_type}.gid=%(entity_mbid)s
""".format(entity_type=entity_type)
df = sql(template, entity_mbid=entity_mbid)
df['date'] = df['date'].apply(lambda d: d.date())
return df
# Beethoven's Ninth, fourth movement
ninth = query_edits('work', '3761b925-82b2-3a61-9581-57751dee2e9e')
print(ninth.head())
ninth.shape
composer history: nb of recordings vs. edit date events: new recording change recording artist merge recording delete recording
artist_name='Páll Ísólfsson'
#artist_name='Claude Debussy'
EDIT_RECORDING_CREATE = 71
EDIT_RECORDING_EDIT = 72
EDIT_RECORDING_DELETE = 73
EDIT_RECORDING_MERGE = 74
def edits_from_new_medium(artist_name):
artist_id = sql("SELECT id FROM artist WHERE name=%(artist_name)s",
artist_name=artist_name).id[0]
df = sql("""
SELECT date_trunc('day', edit.open_time) AS date,
edit_data.data->'tracklist' AS tracklist
FROM artist
JOIN edit_artist ON artist.id = edit_artist.artist
JOIN edit ON edit.id = edit_artist.edit
JOIN edit_data ON edit_data.edit = edit_artist.edit
WHERE artist.name = %(artist_name)s
AND edit.type = 51
;""", artist_name=artist_name)
df.index = df['date'].apply(lambda d: d.date())
del df['date']
df['cnt'] = df.tracklist.apply(lambda tl: [t['artist_credit']['names'][0]['artist']['id']
for t in tl].count(artist_id))
del df['tracklist']
return df
def edits_from_standalone_recordings(artist_name):
df = sql("""
SELECT date_trunc('day', edit.open_time) AS date,
COUNT(*) AS cnt
FROM artist
JOIN edit_artist ON artist.id = edit_artist.artist
JOIN edit ON edit.id = edit_artist.edit
JOIN edit_data ON edit_data.edit = edit_artist.edit
WHERE artist.name = %(artist_name)s
AND edit.type = 71
GROUP BY date
ORDER BY date
;""", artist_name=artist_name)
df.index = df['date'].apply(lambda d: d.date())
del df['date']
return df
def edits_from_recordings_changed_from(artist_name):
df = sql("""
SELECT date_trunc('day', edit.open_time) AS date,
COUNT(*) AS cnt
FROM artist
JOIN edit_artist ON artist.id = edit_artist.artist
JOIN edit ON edit.id = edit_artist.edit
JOIN edit_data ON edit_data.edit = edit_artist.edit
WHERE artist.name = %(artist_name)s
AND edit.type = 72
AND artist.id = CAST(edit_data.data->'old'->'artist_credit'->'names'->0->'artist'->>'id' AS integer)
GROUP BY date
ORDER BY date
;""", artist_name=artist_name)
df.index = df['date'].apply(lambda d: d.date())
del df['date']
df['cnt'] = - df['cnt']
return df
def edits_from_recordings_changed_to(artist_name):
sql("""
SELECT date_trunc('day', edit.open_time) AS date,
COUNT(*) AS cnt
FROM artist
JOIN edit_artist ON artist.id = edit_artist.artist
JOIN edit ON edit.id = edit_artist.edit
JOIN edit_data ON edit_data.edit = edit_artist.edit
WHERE artist.name = %(artist_name)s
AND edit.type = 72
AND artist.id = CAST(edit_data.data->'new'->'artist_credit'->'names'->0->'artist'->>'id' AS integer)
GROUP BY date
ORDER BY date
;""", artist_name=artist_name)
df.index = df['date'].apply(lambda d: d.date())
del df['date']
return df
def edits_from_recordings_merge(artist_name):
return
sql("""
SELECT date_trunc('day', edit.open_time) AS date,
COUNT(*) AS cnt
FROM artist
JOIN edit_artist ON artist.id = edit_artist.artist
JOIN edit ON edit.id = edit_artist.edit
JOIN edit_data ON edit_data.edit = edit_artist.edit
WHERE artist.name = %(artist_name)s
AND edit.type = 74
GROUP BY date
ORDER BY date
;""", artist_name=artist_name)
df['date'] = df['date'].apply(lambda d: d.date())
return df
#artist_name='Páll Ísólfsson'
artist_name='Claude Debussy'
df1 = edits_from_new_medium(artist_name)
df2 = edits_from_standalone_recordings(artist_name)
df3 = edits_from_recordings_changed_from(artist_name)
#df4 = edits_from_recordings_changed_to(artist_name)
df2
%matplotlib inline
#from matplotlib import pyplot
#pyplot.ion()
#fig = pyplot.figure()
#ax = fig.add_subplot(111)
df1.plot.bar()
df2.plot.bar()
df3.plot.bar()
#df4.plot.bar()