The classical music community on MusicBrainz decided to concentrate on one composer and do a monthly cleanup of Debussy data in the database.
Here is the state of the database early September 2017 before the cleanup
%run startup.ipy
composer_mbid = '8d610e51-64b4-4654-b8df-064b0fb7a9d9'
composer_name = 'Gustav Mahler'
def work_list(composer_name):
composer = sql("""
SELECT gid AS mbid,
name
FROM artist
WHERE name = %(composer_name)s;
""", composer_name=composer_name)
works = sql("""
SELECT w.name AS work_name,
lt.text_value AS catalog_no,
s.name AS catalog,
s.id AS catalog_id,
wt.name AS work_type,
la.name AS language,
watav.value AS key,
to_date(to_char(l.begin_date_year, '9999') ||
to_char(COALESCE(l.end_date_month, 1), '99') ||
to_char(COALESCE(l.end_date_day, 1), '99'), 'YYYY MM DD') AS start,
to_date(to_char(l.end_date_year, '9999') ||
to_char(COALESCE(l.end_date_month, 1), '99') ||
to_char(COALESCE(l.end_date_day, 1), '99'), 'YYYY MM DD') AS end,
w.gid AS mbid
FROM work AS w
JOIN l_artist_work AS law ON w.id = law.entity1
JOIN artist AS a ON a.id = law.entity0
JOIN link AS l ON l.id = law.link
LEFT OUTER JOIN l_series_work AS lsw ON w.id = lsw.entity1
LEFT OUTER JOIN series AS s ON s.id = lsw.entity0
LEFT OUTER JOIN link_attribute_text_value AS lt ON lt.link = lsw.link
LEFT OUTER JOIN work_language AS wl ON wl.work = w.id
LEFT OUTER JOIN language AS la ON la.id = wl.language
LEFT OUTER JOIN work_type AS wt ON w.type = wt.id
LEFT OUTER JOIN work_attribute AS wa ON wa.work = w.id
LEFT OUTER JOIN work_attribute_type AS wat ON wat.id = wa.work_attribute_type
LEFT OUTER JOIN work_attribute_type_allowed_value AS watav ON watav.id = wa.work_attribute_type_allowed_value
WHERE a.gid = %(composer_mbid)s
ORDER BY start, work_name;
""", composer_mbid=composer.mbid[0])
works['url'] = works.mbid.apply(mb_work_link)
return works
works = work_list(composer_name)
print('Number of works: {}'.format(works.shape[0]))
print('10 first works:')
iplot(ff.create_table(works[['start', 'work_name', 'url']].head(10)))
imslp = pandas.read_html('http://imslp.org/wiki/List_of_works_by_Gustav_Mahler', header=0, index_col=0)[0]
imslp.head()
Should contain only recordings where Mahler was a performer (conductor)
sql("""
SELECT COUNT(*)
FROM recording AS r
JOIN l_artist_recording AS lar ON r.id = lar.entity1
JOIN artist AS a ON a.id = lar.entity0
WHERE a.gid = %(composer_mbid)s
""", composer_mbid=composer_mbid)
The "composer" relation on releases is deprecated
sql("""
SELECT COUNT(*)
FROM release AS r
JOIN l_artist_release AS lar ON r.id = lar.entity1
JOIN artist AS a ON a.id = lar.entity0
WHERE a.gid = %(composer_mbid)s
""", composer_mbid=composer_mbid)
sql("""
SELECT COUNT(*)
-- rg.name AS album,
-- ac.name AS artists,
-- rg.gid AS mbid
FROM artist AS a
JOIN artist_credit_name AS acn ON a.id = acn.artist
JOIN artist_credit AS ac ON ac.id = acn.artist_credit
JOIN release_group AS rg ON rg.artist_credit = ac.id
-- JOIN release AS r ON r.release_group = rg.id
-- JOIN release_event AS ev ON ev.release = r.id
WHERE a.gid = %(composer_mbid)s
;""", composer_mbid=composer_mbid)
Should contain only recordings where Mahler was a performer (composer)
sql("""
SELECT COUNT(*)
FROM artist AS a
JOIN artist_credit_name AS acn ON a.id = acn.artist
JOIN artist_credit AS ac ON ac.id = acn.artist_credit
JOIN recording AS r ON r.artist_credit = ac.id
WHERE a.gid = %(composer_mbid)s
;""", composer_mbid=composer_mbid)
df = sql("""
SELECT edit.open_time AS date,
1 AS edit_count,
editor.name AS editor
FROM artist
JOIN edit_artist ON artist.id = edit_artist.artist
JOIN edit ON edit.id = edit_artist.edit
JOIN editor ON editor.id = edit.editor
WHERE artist.gid = %(composer_mbid)s
;""", composer_mbid=composer_mbid)
df.date = df.date.apply(lambda x: x.date())
df.head()
print('Total numbers of edits regarding {} in history: {}'.format(composer_name, len(df)))
edit_frequency = df[['date', 'edit_count']].groupby('date').sum()
edit_frequency.index = pandas.to_datetime(edit_frequency.index)
edit_frequency = edit_frequency.resample('M').sum()
print(edit_frequency.tail(10))
iplot({
'data': [dict(type='bar', x=edit_frequency.index, y=edit_frequency.edit_count)],
'layout': {'title': "Number of edits per month for {}".format(composer_name),
'xaxis': {'title': 'Date',
'range': [to_unix_time('2012-10-01'), to_unix_time('2018-01-01')]},
'yaxis': {'title': 'Number of edits'},
}
})
Edits by editor and day (limited to editors with at least 500 edits on Mahler)
pt = df.pivot_table('edit_count', index='date', columns='editor', aggfunc='sum').fillna(0)
pt = pt.loc[:, lambda pt: pt.sum(0) > 500]
iplot({
'data': [{'x': pt.index, 'y': pt[field],
'name': field, 'type': 'bar'} for field in pt.columns],
'layout': {'title': "Number of edits per day per editor",
'xaxis': {'title': 'Date',
'range': [to_unix_time('2017-09-01'), to_unix_time('2017-10-31')]},
'yaxis': {'title': 'Number of edits',
'range': [0, 1500]}
}
})