%run -i ../startup.py
ENTITY_TYPE = 'release-group'
Last notebook update: 2021-01-31
Importing libs
Defining database parameters
Defining *sql* helper function
Last database update: 2021-01-13
Defining *sparql* helper function
Wikidata entities:
https://www.wikidata.org/wiki/Q482994 album
Wikidata properties:
https://www.wikidata.org/wiki/Property:P175 performer
https://www.wikidata.org/wiki/Property:P214 VIAF
https://www.wikidata.org/wiki/Property:P268 BNF
https://www.wikidata.org/wiki/Property:P244 LoC
https://www.wikidata.org/wiki/Property:P1954 discogs master ID
https://www.wikidata.org/wiki/Property:P436 MusicBrainz RG ID
examples
https://www.wikidata.org/wiki/Q7713309
links_type_from_wd = sparql("""
SELECT distinct (count(?rg) as ?cnt) ?ins ?insLabel
WHERE {
?rg wdt:P31 ?ins;
wdt:P436 ?mbid.
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?ins ?insLabel
order by DESC(?cnt)
""")
links_type_from_wd[links_type_from_wd.cnt.astype(int) > 50]
cnt | ins | insLabel | |
---|---|---|---|
0 | 112430 | Q482994 | album |
1 | 26516 | Q134556 | single |
2 | 8615 | Q222910 | compilation album |
3 | 8315 | Q208569 | studio album |
4 | 7049 | Q209939 | live album |
5 | 6706 | Q169930 | extended play |
6 | 3295 | Q7366 | song |
7 | 942 | Q11424 | film |
8 | 658 | Q4176708 | soundtrack album |
9 | 653 | Q963099 | remix album |
10 | 349 | Q7889 | video game |
11 | 209 | Q20737336 | collaborative album |
12 | 176 | Q20089094 | live video album |
13 | 174 | Q723849 | greatest hits album |
14 | 171 | Q2743 | musical theatre |
15 | 165 | Q394970 | box set |
16 | 121 | Q2619673 | DJ mix |
17 | 110 | Q1242743 | double album |
18 | 110 | Q1892995 | mixtape |
19 | 91 | Q368281 | split album |
20 | 86 | Q217199 | soundtrack |
21 | 85 | Q2068728 | hit record |
22 | 83 | Q207628 | musical composition |
23 | 81 | Q10590726 | video album |
24 | 76 | Q7302866 | audio track |
25 | 66 | Q55873388 | Christmas-themed album |
26 | 63 | Q7725634 | literary work |
27 | 58 | Q47461344 | written work |
28 | 57 | Q59854802 | live extended play |
29 | 56 | Q29652773 | film soundtrack |
# linked to MB RG
links_from_wd = sparql("""
SELECT distinct (count(?rg) as ?cnt)
WHERE {
?rg wdt:P436 ?mbid .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
ORDER BY ASC(?rgLabel)
""")
links_from_wd
cnt | |
---|---|
0 | 174974 |
That’s too many to be fetched. Try the artists with a discogs link
links_from_wd = sparql("""
SELECT distinct (count(?rg) as ?cnt)
WHERE {
?rg wdt:P436 ?mbid .
?rg wdt:P1954 ?discogs .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
""")
links_from_wd
cnt | |
---|---|
0 | 95963 |
links_from_wd = sparql("""
SELECT (?rg AS ?wd) ?mbid ?rgLabel ?discogs
WHERE {
?rg wdt:P436 ?mbid .
?rg wdt:P1954 ?discogs .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
ORDER BY ASC(?rgLabel)
""")
links_from_wd.rename(columns={'rgLabel': 'name'}, inplace=True)
print('Count:', len(links_from_wd))
display_df(links_from_wd.head())
Count: 95963
wd | mbid | name | discogs | |
---|---|---|---|---|
0 | Q100701398 | b88cf8ff-ac40-31a0-8722-cf7c215c27df | Q100701398 | 3088228 |
1 | Q1009874 | 7280e3be-30b9-3b14-ab47-a33448916e09 | Q1009874 | 632312 |
2 | Q100998836 | 0c536afe-dbc4-4af7-8c6d-79899dd813f3 | Q100998836 | 1729268 |
3 | Q1016612 | f19754b3-88d1-3cfe-937e-d28b560e59c5 | Q1016612 | 517422 |
4 | Q102128610 | 818bf59a-3982-4486-97e5-8b0748c71aee | Q102128610 | 1521602 |
links_from_mb = sql("""
SELECT
url.url AS wd,
release_group.gid AS mbid,
release_group.name
FROM
release_group
JOIN l_release_group_url AS lau ON lau.entity0 = release_group.id
JOIN url ON lau.entity1 = url.id
WHERE
url.url LIKE '%%wikidata.org%%'
ORDER BY
release_group.name
;
""")
links_from_mb.wd = links_from_mb.wd.apply(lambda s: s.split('/')[-1])
links_from_mb.mbid = links_from_mb.mbid.apply(str)
print('Count:', len(links_from_mb))
display_df(links_from_mb.head())
Count: 115750
wd | mbid | name | |
---|---|---|---|
0 | Q66092288 | 46a4376d-7fdd-416a-9b26-f38273e13f76 | ! |
1 | Q2705922 | e2e3f68e-4b21-3296-88f5-39d50f53cc72 | !!! |
2 | Q3596098 | c00126c7-6b6f-3857-8f4e-8e4b2aba635c | !!Destroy‐Oh‐Boy!! |
3 | Q5577828 | 3f762d15-18ba-3883-a7b4-73a0c9f39f46 | !!Going Places!! |
4 | Q5037656 | dcacff75-c1ed-36f9-b139-425cf2612574 | !Caramba! |
duplicate_wd = links_from_mb[[
'wd', 'mbid', 'name']].groupby('wd').filter(
lambda row: len(row.mbid) > 1).sort_values('wd')
print('Count:', len(duplicate_wd))
display_df(duplicate_wd.head())
Count: 940
wd | mbid | name | |
---|---|---|---|
0 | Q1028031 | dcd22f78-aac2-37c6-b55e-96cba448e259 | Driving Home for Christmas / Hello Friend (Re-Recorded) |
1 | Q1028031 | 124f7c14-40f0-3746-9a9b-fdb74fd5387b | Driving Home for Christmas: The Christmas EP |
2 | Q1028055 | 7f697ec7-d422-3690-9788-f837dce3777e | Greatest Hits |
3 | Q1028055 | 5410586d-698c-38cc-b26f-b61577f4c32d | Greatest Hits |
4 | Q10280819 | cb4d06c2-94fc-3e75-9ee7-1d4a9182e044 | Feijoada acidente? - Brasil |
duplicate_mb = links_from_mb[['wd', 'mbid', 'name']].groupby('mbid').filter(
lambda row: len(row.mbid) > 1).sort_values('mbid')
print('Count:', len(duplicate_mb))
display_df(duplicate_mb.head())
Count: 185
wd | mbid | name | |
---|---|---|---|
0 | Q69715896 | 0402ea05-c920-30c3-9559-94a6ebd783a3 | My Generation |
1 | Q69709269 | 0402ea05-c920-30c3-9559-94a6ebd783a3 | My Generation |
2 | Q15971649 | 05c9b920-d313-3220-b997-f8ed16ebeaae | 17: Greatest Hits |
3 | Q565500 | 05c9b920-d313-3220-b997-f8ed16ebeaae | 17: Greatest Hits |
4 | Q9363343 | 0a11885b-3fcf-3d41-9e54-2f16582190af | Zmierzch Bogów |
merge = pd.merge(links_from_wd, links_from_mb,
on=['wd', 'mbid'], suffixes=('_wd', '_mb'),
how='outer', indicator=True)
display_df(merge.head())
wd | mbid | name_wd | discogs | name_mb | _merge | |
---|---|---|---|---|---|---|
0 | Q100701398 | b88cf8ff-ac40-31a0-8722-cf7c215c27df | Q100701398 | 3088228 | NaN | left_only |
1 | Q1009874 | 7280e3be-30b9-3b14-ab47-a33448916e09 | Q1009874 | 632312 | Bunte Scherben | both |
2 | Q100998836 | 0c536afe-dbc4-4af7-8c6d-79899dd813f3 | Q100998836 | 1729268 | NaN | left_only |
3 | Q1016612 | f19754b3-88d1-3cfe-937e-d28b560e59c5 | Q1016612 | 517422 | Burli | both |
4 | Q102128610 | 818bf59a-3982-4486-97e5-8b0748c71aee | Q102128610 | 1521602 | NaN | left_only |
# link in mb but missing in wd
links_to_add_to_wd = merge.loc[lambda x : x['_merge']=='right_only'][[
'name_mb', 'mbid', 'wd']]
print('Count:', len(links_to_add_to_wd))
display_df(links_to_add_to_wd.head())
Count: 51875
name_mb | mbid | wd | |
---|---|---|---|
0 | ! | 46a4376d-7fdd-416a-9b26-f38273e13f76 | Q66092288 |
1 | "... The Truth Is a Fucking Lie..." | 63451c2c-c39d-3d30-b7e2-c6608708db4d | Q13416956 |
2 | "10" | 0cc5b1f4-6df9-35df-becf-cb7e954eb893 | Q184591 |
3 | "77" | 180f5161-23f7-49a0-beec-09555d4a654e | Q11880947 |
4 | "A" e o "Z" | 5baebed3-fcab-30ec-8eda-1eccf5921b22 | Q3285394 |
# link in wd but missing in mb
links_to_add_to_mb = merge.loc[lambda x : x['_merge']=='left_only'][[
'name_wd', 'wd', 'mbid']]
links_to_add_to_mb['edit_link'] = links_to_add_to_mb.apply(
mb_artist_edit_wd_link, axis=1)
print('Count:', len(links_to_add_to_mb))
display_df(links_to_add_to_mb.head())
Count: 32046
name_wd | wd | mbid | edit_link | |
---|---|---|---|---|
0 | Q100701398 | Q100701398 | b88cf8ff-ac40-31a0-8722-cf7c215c27df | edit |
1 | Q100998836 | Q100998836 | 0c536afe-dbc4-4af7-8c6d-79899dd813f3 | edit |
2 | Q102128610 | Q102128610 | 818bf59a-3982-4486-97e5-8b0748c71aee | edit |
3 | Q10217180 | Q10217180 | 307af7b8-67ef-38a4-95df-cc366ef636b6 | edit |
4 | Q102338667 | Q102338667 | d95a18d0-af86-367f-bf31-8b43abc0c824 | edit |
TBD on wd entries with discogs links and no mb link
# linked to Discogs master
discogs_links_from_wd = sparql("""
SELECT (?rg AS ?wd) ?rgLabel ?discogs
WHERE {
?rg wdt:P1954 ?discogs .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
MINUS {
?rg wdt:P436 ?mbid .
}
}
ORDER BY ASC(?rgLabel)
""")
discogs_links_from_wd.rename(columns={'rgLabel': 'name'}, inplace=True)
print('Count:', len(discogs_links_from_wd))
display_df(discogs_links_from_wd.head())
Count: 24444
wd | name | discogs | |
---|---|---|---|
0 | Q100251747 | Q100251747 | 893897 |
1 | Q100300673 | Q100300673 | 1150563 |
2 | Q100316840 | Q100316840 | 6530510 |
3 | Q100342804 | Q100342804 | 1108665 |
4 | Q100378070 | Q100378070 | 275211 |
discogs_links_from_mb = sql("""
SELECT
url.url AS discogs,
release_group.gid AS mbid,
release_group.name
FROM
release_group
JOIN l_release_group_url AS lau ON lau.entity0 = release_group.id
JOIN url ON lau.entity1 = url.id
WHERE
url.url LIKE '%%discogs.com%%'
AND lau.entity0 IN (
SELECT
entity0
FROM
l_release_group_url
JOIN url ON l_release_group_url.entity1 = url.id
WHERE
url.url LIKE '%%discogs.com%%'
EXCEPT
SELECT
entity0
FROM
l_release_group_url
JOIN url ON l_release_group_url.entity1 = url.id
WHERE
url.url LIKE '%%wikidata.org%%'
)
ORDER BY
release_group.name
;
""")
discogs_links_from_mb.discogs = discogs_links_from_mb.discogs.apply(lambda s: s.split('/')[-1])
discogs_links_from_mb.mbid = discogs_links_from_mb.mbid.apply(str)
print('Count:', len(discogs_links_from_mb))
display_df(discogs_links_from_mb.head())
Count: 171673
discogs | mbid | name | |
---|---|---|---|
0 | 1784550 | dae13e8a-0d21-4151-a492-f4ec39394d81 | ! |
1 | 1793473 | 16c05add-e78e-40ee-a809-6b68a5d7d664 | !! |
2 | 427086 | 6f319a03-20ac-33c8-98a0-1ef746e78d04 | !! |
3 | 321441 | 4cd7ca7e-d8fd-3cdb-a9c7-1caef23deab1 | !!!Here Ain't the Sonics!!! |
4 | 136546 | eae71c65-4b63-4ea8-a759-40f190ba67f6 | !Catch the Beat! / !Catch the Groove! |
discogs_merge = pd.merge(discogs_links_from_wd, discogs_links_from_mb,
on=['discogs'], suffixes=('_wd', '_mb'),
how='inner', indicator=False)
discogs_merge['edit_link'] = discogs_merge.apply(
mb_releasegroup_edit_wd_link, axis=1)
print('Count:', len(discogs_merge))
display_df(discogs_merge.head())
Count: 3317
wd | name_wd | discogs | mbid | name_mb | edit_link | |
---|---|---|---|---|---|---|
0 | Q101480010 | Q101480010 | 322472 | 1d27dcca-2980-47bc-b358-a3c038bfb90d | La convenzione / Paranoia | edit |
1 | Q101510523 | Q101510523 | 768022 | 4eba0184-9340-4fde-b1c3-b243bc75a418 | L'era del cinghiale bianco / Luna indiana | edit |
2 | Q102350795 | Q102350795 | 563853 | a974458d-f876-443f-b501-f7aec0e416b8 | Big Big Hits of '62 | edit |
3 | Q10278713 | Q10278713 | 334266 | 42dc40f6-f27c-47ef-8d09-58e941046fa1 | Explode | edit |
4 | Q10282621 | Q10282621 | 293281 | f619677a-e20a-38cd-ae74-09ef523f097d | First / Second | edit |
import jinja2
template = jinja2.Template("""
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Alignment of MusicBrainz and Wikidata Release Groups</title>
<link href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
</head>
<body style="margin: 20px;">
<h1>Alignment of MusicBrainz and Wikidata Release Groups</h1>
<p>Latest MB database update: </p>
<p>Latest update: </p>
<ol>
<li>
<a href="#wddup">MusicBrainz entities sharing a Wikidata link</a>
( rows)
</li>
<li>
<a href="#mbdup">Wikidata entities sharing a MusicBrainz link</a>
( rows)
</li>
<li>
<a href="#wd2mb">Add missing Wikidata links to MusicBrainz</a>
( rows)
</li>
<li>
<a href="#discogs2mb">Add missing Wikidata links to MusicBrainz (through Discogs)</a>
( rows)
</li>
<li>
<a href="#mb2wd">Add missing MusicBrainz links to Wikidata</a>
</li>
</ol>
<h2 id="wddup">MusicBrainz entities sharing a Wikidata link</h2>
<h2 id="mbdup">Wikidata entities sharing a MusicBrainz link</h2>
<h2 id="wd2mb">Add missing Wikidata links to MusicBrainz</h2>
<h2 id="discogs2mb">Add missing Wikidata links to MusicBrainz (through Discogs)</h2>
<h2 id="mb2wd">Add missing MusicBrainz links to Wikidata</h2>
</body>
</html>
""")
with open('../docs/wd-releasegroups-report.html', 'w') as f:
f.write(template.render(**globals())
.replace('<', '<').replace('>', '>')
.replace('class="dataframe"', 'class="table table-striped table-hover table-sm"')
.replace('thead', 'thead class="thead-light"'))