The classical music community on MusicBrainz decided to concentrate on one composer and do a monthly cleanup of Debussy data in the database.
Here is the state of the database early September 2017 after the cleanup
composer_name = 'Claude Debussy'
composer_mbid = 'be50643c-0377-4968-b48c-47e06b2e2a3b'
%run startup.ipy
import datetime as dt
def to_unix_time(datestring):
epoch = dt.datetime(1970, 1, 1)
return (dt.datetime.strptime(datestring, '%Y-%m-%d') - epoch).total_seconds() * 1000
def work_list(composer_name):
composer = sql("""
SELECT gid AS mbid,
name
FROM artist
WHERE name = %(composer_name)s;
""", composer_name=composer_name)
works = sql("""
SELECT w.name AS work_name,
lt.text_value AS catalog_no,
s.name AS catalog,
s.id AS catalog_id,
wt.name AS work_type,
la.name AS language,
watav.value AS key,
to_date(to_char(l.begin_date_year, '9999') ||
to_char(COALESCE(l.end_date_month, 1), '99') ||
to_char(COALESCE(l.end_date_day, 1), '99'), 'YYYY MM DD') AS start,
to_date(to_char(l.end_date_year, '9999') ||
to_char(COALESCE(l.end_date_month, 1), '99') ||
to_char(COALESCE(l.end_date_day, 1), '99'), 'YYYY MM DD') AS end,
w.gid AS mbid
FROM work AS w
JOIN l_artist_work AS law ON w.id = law.entity1
JOIN artist AS a ON a.id = law.entity0
JOIN link AS l ON l.id = law.link
LEFT OUTER JOIN l_series_work AS lsw ON w.id = lsw.entity1
LEFT OUTER JOIN series AS s ON s.id = lsw.entity0
LEFT OUTER JOIN link_attribute_text_value AS lt ON lt.link = lsw.link
LEFT OUTER JOIN work_language AS wl ON wl.work = w.id
LEFT OUTER JOIN language AS la ON la.id = wl.language
LEFT OUTER JOIN work_type AS wt ON w.type = wt.id
LEFT OUTER JOIN work_attribute AS wa ON wa.work = w.id
LEFT OUTER JOIN work_attribute_type AS wat ON wat.id = wa.work_attribute_type
LEFT OUTER JOIN work_attribute_type_allowed_value AS watav ON watav.id = wa.work_attribute_type_allowed_value
WHERE a.gid = %(composer_mbid)s
ORDER BY start, work_name;
""", composer_mbid=composer.mbid[0])
works['url'] = works.mbid.apply(mb_work_link)
return works
works = work_list('Claude Debussy')
print('Number of works: {}'.format(works.shape[0]))
print('10 first works:')
iplot(ff.create_table(works[['start', 'work_name', 'url']].head(10)))
composer = sql("""
SELECT gid AS mbid,
name
FROM artist
WHERE name = %(composer_name)s;
""", composer_name='Claude Debussy')
#print(composer)
catno = sql("""
SELECT w.name AS work_name,
lt.text_value AS catalog_no,
url AS imslp
FROM work AS w
JOIN l_artist_work AS law ON w.id = law.entity1
JOIN artist AS a ON a.id = law.entity0
JOIN link AS l ON l.id = law.link
JOIN l_series_work AS lsw ON w.id = lsw.entity1
JOIN series AS s ON s.id = lsw.entity0
JOIN link_attribute_text_value AS lt ON lt.link = lsw.link
LEFT OUTER JOIN l_url_work AS luw ON w.id = luw.entity1
LEFT OUTER JOIN url AS u ON u.id = luw.entity0
WHERE a.gid = %(composer_mbid)s
-- AND s.gid = '8d45008f-a609-4b97-a038-eaf946b2533f'
AND s.gid = 'ce1cbe59-d4e1-4d70-b0be-3c0131cddb15'
AND url ILIKE '%%imslp%%'
ORDER BY catalog_no;
""", composer_mbid=composer.mbid[0])
catno.index = catno.catalog_no
catno.drop('catalog_no', axis=1, inplace=True)
catno.head()
#iplot(ff.create_table(works[['start', 'work_name', 'url']].head(10)))
imslp = pandas.read_html('http://imslp.org/wiki/List_of_works_by_Claude_Debussy', header=0, index_col=0)[0]
imslp.head()
works.head()
compar = imslp.copy()
compar['mbid'] = None
for idx, row in works.iterrows():
if row.catalog_no and row.catalog_no.startswith('L'):
try:
compar.loc[row.catalog_no.replace('. ', '')].mbid = row.mbid
except:
print('Lesure catalog in MB not found in IMSLP list: {0.catalog_no} (mbid {0.mbid})'.format(row))
pass
print('Lesure catalog entries in IMSLP that do not have a mbid:')
compar[compar.mbid.isnull()]
Should contain only recordings where Debussy was a performer (pianist)
sql("""
SELECT COUNT(*)
FROM recording AS r
JOIN l_artist_recording AS lar ON r.id = lar.entity1
JOIN artist AS a ON a.id = lar.entity0
WHERE a.gid = %(composer_mbid)s
""", composer_mbid=composer_mbid)
The "composer" relation on releases is deprecated
sql("""
SELECT COUNT(*)
FROM release AS r
JOIN l_artist_release AS lar ON r.id = lar.entity1
JOIN artist AS a ON a.id = lar.entity0
WHERE a.gid = %(composer_mbid)s
""", composer_mbid=composer_mbid)
sql("""
SELECT COUNT(*)
-- rg.name AS album,
-- ac.name AS artists,
-- rg.gid AS mbid
FROM artist AS a
JOIN artist_credit_name AS acn ON a.id = acn.artist
JOIN artist_credit AS ac ON ac.id = acn.artist_credit
JOIN release_group AS rg ON rg.artist_credit = ac.id
-- JOIN release AS r ON r.release_group = rg.id
-- JOIN release_event AS ev ON ev.release = r.id
WHERE a.gid = %(composer_mbid)s
;""", composer_mbid=composer_mbid)
Should contain only recordings where Debussy was a performer (pianist)
sql("""
SELECT COUNT(*)
FROM artist AS a
JOIN artist_credit_name AS acn ON a.id = acn.artist
JOIN artist_credit AS ac ON ac.id = acn.artist_credit
JOIN recording AS r ON r.artist_credit = ac.id
WHERE a.gid = %(composer_mbid)s
;""", composer_mbid=composer_mbid)
df = sql("""
SELECT date_trunc('month', edit.open_time) AS date,
editor.name AS editor,
COUNT(*) AS edit_count
FROM artist
JOIN edit_artist ON artist.id = edit_artist.artist
JOIN edit ON edit.id = edit_artist.edit
JOIN editor ON editor.id = edit.editor
WHERE artist.gid = %(composer_mbid)s
-- AND edit.open_time >= '2017-07-01'
GROUP BY date, editor.name
ORDER BY date, edit_count DESC
;""", composer_mbid=composer_mbid)
df.date = df.date.apply(lambda x: x.date())
#df.index =df.date.apply(lambda x: x.date())
#df.drop('date', axis=1, inplace=True)
df.head()
print('Total numbers of edits regarding Debussy in history: {}'.format(df.edit_count.sum()))
edit_frequency = df[['date', 'edit_count']].groupby('date').sum()
print(edit_frequency.tail(10))
iplot({
'data': [dict(type='bar', x=edit_frequency.index, y=edit_frequency.edit_count)],
'layout': {'title': "Number of edits per month for {}".format(composer_name),
'xaxis': {'title': 'Date',
'range': [to_unix_time('2012-10-01'), to_unix_time('2018-01-01')]},
'yaxis': {'title': 'Number of edits'},
}
})
Not very surpringly, the monthly peak was obtained during the community cleanup: nearly 11000 edits! Some people started working in July when the sprint was announced which explained the high number (more than 4500 edits)
Edits by editor and day (limited to editors with at least 500 edits on Debussy)
pt = df.pivot_table('edit_count', index='date', columns='editor', aggfunc='sum').fillna(0)
pt = pt.loc[:, lambda pt: pt.sum(0) > 500]
pt.head()
import datetime as dt
iplot({
'data': [{'x': pt.index, 'y': pt[field],
'name': field, 'type': 'bar'} for field in pt.columns],
'layout': {'title': "Number of edits per month per editor",
'xaxis': {'title': 'Date',
'range': [to_unix_time('2017-06-01'), to_unix_time('2017-10-31')]},
'yaxis': {'title': 'Number of edits',
'range': [0, 6500]}
}
})
Congrats stupidname and ListMyCDs.com!
pt.index = pandas.to_datetime(pt.index)
pt['2017-07-01':'2017-11-01'].sum().sort_values(ascending=False).head(10)