%run -i ../startup.py
ENTITY_TYPE = 'artist'
Last notebook update: 2021-01-30
Importing libs
Defining database parameters
Defining *sql* helper function
Last database update: 2021-01-13
Defining *sparql* helper function
Wikidata entities:
https://www.wikidata.org/wiki/Q482994 album
Wikidata properties:
https://www.wikidata.org/wiki/Property:P214 VIAF
https://www.wikidata.org/wiki/Property:P268 BNF
https://www.wikidata.org/wiki/Property:P244 LoC
https://www.wikidata.org/wiki/Property:P1953 discogs artist ID
https://www.wikidata.org/wiki/Property:P434 MusicBrainz Artist ID
examples
links_type_from_wd = sparql("""
SELECT distinct (count(?artist) as ?cnt) ?ins ?insLabel
WHERE {
?artist wdt:P31 ?ins;
wdt:P434 ?mbid.
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?ins ?insLabel
order by DESC(?cnt)
""")
links_type_from_wd[links_type_from_wd.cnt.astype(int) > 10]
---------------------------------------------------------------------------
RemoteDisconnected Traceback (most recent call last)
~/mbz/musicbrainz-sparql/startup.py in <module>
----> 1 links_type_from_wd = sparql("""
2 SELECT distinct (count(?artist) as ?cnt) ?ins ?insLabel
3 WHERE {
4 ?artist wdt:P31 ?ins;
5 wdt:P434 ?mbid.
~/mbz/musicbrainz-sparql/startup.py in sparql(query, endpoint, **kwargs)
131 wrapper = SPARQLWrapper2(endpoint)
132 wrapper.setQuery(query)
--> 133 results = wrapper.query()
134
135 def _clean_url(url):
~/.virtualenvs/mbsparq/lib/python3.9/site-packages/SPARQLWrapper/SmartWrapper.py in query(self)
310 :rtype: :class:`Bindings` instance
311 """
--> 312 res = super(SPARQLWrapper2, self).query()
313
314 if self.queryType == SELECT:
~/.virtualenvs/mbsparq/lib/python3.9/site-packages/SPARQLWrapper/Wrapper.py in query(self)
1105 :rtype: :class:`QueryResult` instance
1106 """
-> 1107 return QueryResult(self._query())
1108
1109 def queryAndConvert(self):
~/.virtualenvs/mbsparq/lib/python3.9/site-packages/SPARQLWrapper/Wrapper.py in _query(self)
1071 response = urlopener(request, timeout=self.timeout)
1072 else:
-> 1073 response = urlopener(request)
1074 return response, self.returnFormat
1075 except urllib.error.HTTPError as e:
/usr/lib/python3.9/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
212 else:
213 opener = _opener
--> 214 return opener.open(url, data, timeout)
215
216 def install_opener(opener):
/usr/lib/python3.9/urllib/request.py in open(self, fullurl, data, timeout)
515
516 sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
--> 517 response = self._open(req, data)
518
519 # post-process response
/usr/lib/python3.9/urllib/request.py in _open(self, req, data)
532
533 protocol = req.type
--> 534 result = self._call_chain(self.handle_open, protocol, protocol +
535 '_open', req)
536 if result:
/usr/lib/python3.9/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
492 for handler in handlers:
493 func = getattr(handler, meth_name)
--> 494 result = func(*args)
495 if result is not None:
496 return result
/usr/lib/python3.9/urllib/request.py in https_open(self, req)
1387
1388 def https_open(self, req):
-> 1389 return self.do_open(http.client.HTTPSConnection, req,
1390 context=self._context, check_hostname=self._check_hostname)
1391
/usr/lib/python3.9/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
1348 except OSError as err: # timeout error
1349 raise URLError(err)
-> 1350 r = h.getresponse()
1351 except:
1352 h.close()
/usr/lib/python3.9/http/client.py in getresponse(self)
1345 try:
1346 try:
-> 1347 response.begin()
1348 except ConnectionError:
1349 self.close()
/usr/lib/python3.9/http/client.py in begin(self)
305 # read until we get a non-100 response
306 while True:
--> 307 version, status, reason = self._read_status()
308 if status != CONTINUE:
309 break
/usr/lib/python3.9/http/client.py in _read_status(self)
274 # Presumably, the server closed the connection before
275 # sending a valid response.
--> 276 raise RemoteDisconnected("Remote end closed connection without"
277 " response")
278 try:
RemoteDisconnected: Remote end closed connection without response
# linked to MB artist
links_from_wd = sparql("""
SELECT distinct (count(?artist) as ?cnt)
WHERE {
?artist wdt:P434 ?mbid .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
ORDER BY ASC(?artistLabel)
""")
links_from_wd
cnt | |
---|---|
0 | 220459 |
That’s too many to be fetched. Try the artists with a discogs link
# linked to MB artist
links_from_wd = sparql("""
SELECT distinct (count(?artist) as ?cnt)
WHERE {
?artist wdt:P434 ?mbid .
?artist wdt:P1953 ?discogs .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
ORDER BY ASC(?artistLabel)
""")
links_from_wd
cnt | |
---|---|
0 | 139136 |
# linked to MB work
links_from_wd = sparql("""
SELECT (?artist AS ?wd) ?mbid ?artistLabel ?discogs
WHERE {
?artist wdt:P434 ?mbid .
?artist wdt:P1953 ?discogs .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
ORDER BY ASC(?artistLabel)
""")
links_from_wd.rename(columns={'artistLabel': 'name'}, inplace=True)
print('Count:', len(links_from_wd))
display_df(links_from_wd.head())
Count: 139136
wd | mbid | name | discogs | |
---|---|---|---|---|
0 | Q100147499 | 3a7d3219-41e1-4f5a-87f4-fbf5c7aed04d | Q100147499 | 1642404 |
1 | Q100251627 | a9639067-da3f-4747-b604-0269241d7494 | Q100251627 | 5776203 |
2 | Q100315776 | d7c66a58-a91e-48b8-af59-0ea96b3e9f33 | Q100315776 | 4750497 |
3 | Q100450275 | a1eec171-4baf-4e5f-8156-4fbc626f1ff8 | Q100450275 | 554436 |
4 | Q100995191 | 0c159e39-d25a-4d09-a33d-3d72c45f5324 | Q100995191 | 1625893 |
links_from_mb = sql("""
SELECT
url.url AS wd,
artist.gid AS mbid,
artist.name
FROM
artist
JOIN l_artist_url AS lau ON lau.entity0 = artist.id
JOIN url ON lau.entity1 = url.id
WHERE
url.url LIKE '%%wikidata.org%%'
ORDER BY
artist.name
;
""")
links_from_mb.wd = links_from_mb.wd.apply(lambda s: s.split('/')[-1])
links_from_mb.mbid = links_from_mb.mbid.apply(str)
print('Count:', len(links_from_mb))
display_df(links_from_mb.head())
Count: 200132
wd | mbid | name | |
---|---|---|---|
0 | Q371 | f26c72d3-e52c-467b-b651-679c73d8e1a7 | !!! |
1 | Q343686 | 8924ec2e-ff39-4968-a686-48effcb39d5e | !Action Pact! |
2 | Q1183824 | 55cc41c4-d527-4b1a-b45d-4c2ff718f000 | !DelaDap |
3 | Q1622767 | 54a1dd0e-a535-46a9-9ad8-b49ebe1ce482 | !T.O.O.H.! |
4 | Q14565188 | c60674c3-1cd8-4b2c-ada3-8f99d94c4aad | !distain |
duplicate_wd = links_from_mb[[
'wd', 'mbid', 'name']].groupby('wd').filter(
lambda row: len(row.mbid) > 1).sort_values('wd')
print('Count:', len(duplicate_wd))
display_df(duplicate_wd.head())
Count: 4461
wd | mbid | name | |
---|---|---|---|
0 | Q1004132 | 98452b79-5c2b-4e1c-9bb8-ff9592c2a061 | Bull City Red |
1 | Q1004132 | 54143235-e5fb-4747-ae86-0433075a547d | George Washington |
2 | Q100464 | 661be230-f092-4e64-b5ef-196f4e5f53e7 | Wolfgang Roloff |
3 | Q100464 | 072f218c-da41-4a7a-b0aa-9aa816dc160c | Ronny |
4 | Q1005957 | 6054baf8-3a47-4c3a-8c51-eb976f69ecfc | Hobo |
duplicate_mb = links_from_mb[['wd', 'mbid', 'name']].groupby('mbid').filter(
lambda row: len(row.mbid) > 1).sort_values('mbid')
print('Count:', len(duplicate_mb))
display_df(duplicate_mb.head())
Count: 244
wd | mbid | name | |
---|---|---|---|
0 | Q79048065 | 01793aba-0995-4df1-86f2-baa31b4e4cb7 | STAXKK |
1 | Q79048066 | 01793aba-0995-4df1-86f2-baa31b4e4cb7 | STAXKK |
2 | Q57314794 | 06783a0f-cf91-4fe6-97dc-5f303dcdb36c | 梅田綾乃 |
3 | Q15908991 | 06783a0f-cf91-4fe6-97dc-5f303dcdb36c | 梅田綾乃 |
4 | Q98443953 | 10d626a3-ce89-4fe7-b8e0-b502a9148d60 | Sebastian Barrera |
merge = pd.merge(links_from_wd, links_from_mb,
on=['wd', 'mbid'], suffixes=('_wd', '_mb'),
how='outer', indicator=True)
display_df(merge.head())
wd | mbid | name_wd | discogs | name_mb | _merge | |
---|---|---|---|---|---|---|
0 | Q100147499 | 3a7d3219-41e1-4f5a-87f4-fbf5c7aed04d | Q100147499 | 1642404 | NaN | left_only |
1 | Q100251627 | a9639067-da3f-4747-b604-0269241d7494 | Q100251627 | 5776203 | NaN | left_only |
2 | Q100315776 | d7c66a58-a91e-48b8-af59-0ea96b3e9f33 | Q100315776 | 4750497 | NaN | left_only |
3 | Q100450275 | a1eec171-4baf-4e5f-8156-4fbc626f1ff8 | Q100450275 | 554436 | Alistair Anderson | both |
4 | Q100995191 | 0c159e39-d25a-4d09-a33d-3d72c45f5324 | Q100995191 | 1625893 | NaN | left_only |
# link in mb but missing in wd
links_to_add_to_wd = merge.loc[lambda x : x['_merge']=='right_only'][[
'name_mb', 'mbid', 'wd']]
print('Count:', len(links_to_add_to_wd))
display_df(links_to_add_to_wd.head())
Count: 85896
name_mb | mbid | wd | |
---|---|---|---|
0 | "Kid" Prince Moore | 72050802-74fe-46a8-b926-ff08740e1058 | Q28008608 |
1 | "Mean" Gene Okerlund | 44843557-35d0-4bd1-afae-bd162fb007fc | Q955033 |
2 | "a band called David" | f583f07d-c1e2-407d-b869-9d7c6a6a34af | Q4655343 |
3 | #1 Dads | 7aaf9000-eceb-4290-b38d-8cb4242c7730 | Q22080866 |
4 | #2Маши | c352886d-fa34-4d6e-8c88-688c1c8c29d1 | Q62964320 |
# link in wd but missing in mb
links_to_add_to_mb = merge.loc[lambda x : x['_merge']=='left_only'][[
'name_wd', 'wd', 'mbid']]
links_to_add_to_mb['edit_link'] = links_to_add_to_mb.apply(
mb_artist_edit_wd_link, axis=1)
print('Count:', len(links_to_add_to_mb))
display_df(links_to_add_to_mb.head())
Count: 21407
name_wd | wd | mbid | edit_link | |
---|---|---|---|---|
0 | Q100147499 | Q100147499 | 3a7d3219-41e1-4f5a-87f4-fbf5c7aed04d | edit |
1 | Q100251627 | Q100251627 | a9639067-da3f-4747-b604-0269241d7494 | edit |
2 | Q100315776 | Q100315776 | d7c66a58-a91e-48b8-af59-0ea96b3e9f33 | edit |
3 | Q100995191 | Q100995191 | 0c159e39-d25a-4d09-a33d-3d72c45f5324 | edit |
4 | Q101072266 | Q101072266 | b29c45fe-3e06-4736-9053-1cbe93f61691 | edit |
TBD on wd entries with discogs links and no mb link
# linked to Discogs artist
discogs_links_from_wd = sparql("""
SELECT (?artist AS ?wd) ?artistLabel ?discogs
WHERE {
?artist wdt:P1953 ?discogs .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
MINUS {
?artist wdt:P434 ?mbid .
}
}
ORDER BY ASC(?artistLabel)
""")
discogs_links_from_wd.rename(columns={'artistLabel': 'name'}, inplace=True)
print('Count:', len(discogs_links_from_wd))
display_df(discogs_links_from_wd.head())
Count: 37976
wd | name | discogs | |
---|---|---|---|
0 | Q100024805 | Q100024805 | 3469656 |
1 | Q100266269 | Q100266269 | 5025860 |
2 | Q100268968 | Q100268968 | 2763300 |
3 | Q100272130 | Q100272130 | 1005141 |
4 | Q100448674 | Q100448674 | 3792973 |
discogs_links_from_mb = sql("""
SELECT
url.url AS discogs,
artist.gid AS mbid,
artist.name
FROM
artist
JOIN l_artist_url AS lau ON lau.entity0 = artist.id
JOIN url ON lau.entity1 = url.id
WHERE
url.url LIKE '%%discogs.com%%'
AND lau.entity0 IN (
SELECT
entity0
FROM
l_artist_url
JOIN url ON l_artist_url.entity1 = url.id
WHERE
url.url LIKE '%%discogs.com%%'
EXCEPT
SELECT
entity0
FROM
l_artist_url
JOIN url ON l_artist_url.entity1 = url.id
WHERE
url.url LIKE '%%wikidata.org%%'
)
ORDER BY
artist.name
;
""")
discogs_links_from_mb.discogs = discogs_links_from_mb.discogs.apply(lambda s: s.split('/')[-1])
discogs_links_from_mb.mbid = discogs_links_from_mb.mbid.apply(str)
print('Count:', len(discogs_links_from_mb))
display_df(discogs_links_from_mb.head())
Count: 575584
discogs | mbid | name | |
---|---|---|---|
0 | 2738698 | d0d9b2a4-c5f0-4407-90cd-44dfd513158b | ! Obtain? |
1 | 1101118 | ba813b0d-982a-491a-8648-3b2097b069dd | !!! |
2 | 542719 | a057fba4-ac0e-4745-88e7-6e89c586e9cb | !!! |
3 | 1036695 | 19605e24-020f-4493-97b5-a4f9880a3ac5 | !!* |
4 | 213060 | acec84ff-c4cb-420c-96cf-982b651573bd | !!Swanhunter |
discogs_merge = pd.merge(discogs_links_from_wd, discogs_links_from_mb,
on=['discogs'], suffixes=('_wd', '_mb'),
how='inner', indicator=False)
discogs_merge['edit_link'] = discogs_merge.apply(
mb_artist_edit_wd_link, axis=1)
print('Count:', len(discogs_merge))
display_df(discogs_merge.head())
Count: 8748
wd | name_wd | discogs | mbid | name_mb | edit_link | |
---|---|---|---|---|---|---|
0 | Q100268968 | Q100268968 | 2763300 | 9df09077-c63d-45c0-bc60-114589f83996 | Attilio Staffelli | edit |
1 | Q101158997 | Q101158997 | 5487155 | e4f25fd5-9e46-48b9-b5b8-cf3633a157a1 | Floating Sofa Quartet | edit |
2 | Q101429533 | Q101429533 | 2038099 | 7dd59664-0b60-4eef-9280-491e6c679d37 | Robin Fincker | edit |
3 | Q101504558 | Q101504558 | 601285 | 9554bc87-d985-4f73-a394-5c52fbca6449 | Marius Beets | edit |
4 | Q101935166 | Q101935166 | 3201305 | e6470441-2cc3-4b3f-b361-92c5f23f9707 | Constantin Herzog | edit |
import jinja2
template = jinja2.Template("""
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Alignment of MusicBrainz and Wikidata Artists</title>
<link href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
</head>
<body style="margin: 20px;">
<h1>Alignment of MusicBrainz and Wikidata Artists</h1>
<p>Latest MB database update: </p>
<p>Latest update: </p>
<ol>
<li>
<a href="#wddup">MusicBrainz entities sharing a Wikidata link</a>
( rows)
</li>
<li>
<a href="#mbdup">Wikidata entities sharing a MusicBrainz link</a>
( rows)
</li>
<li>
<a href="#wd2mb">Add missing Wikidata links to MusicBrainz</a>
( rows)
</li>
<li>
<a href="#discogs2mb">Add missing Wikidata links to MusicBrainz (through Discogs)</a>
( rows)
</li>
<li>
<a href="#mb2wd">Add missing MusicBrainz links to Wikidata</a>
</li>
</ol>
<h2 id="wddup">MusicBrainz entities sharing a Wikidata link</h2>
<h2 id="mbdup">Wikidata entities sharing a MusicBrainz link</h2>
<h2 id="wd2mb">Add missing Wikidata links to MusicBrainz</h2>
<h2 id="discogs2mb">Add missing Wikidata links to MusicBrainz (through Discogs)</h2>
<h2 id="mb2wd">Add missing MusicBrainz links to Wikidata</h2>
</body>
</html>
""")
with open('../docs/wd-artists-report.html', 'w') as f:
f.write(template.render(**globals())
.replace('<', '<').replace('>', '>')
.replace('class="dataframe"', 'class="table table-striped table-hover table-sm"')
.replace('thead', 'thead class="thead-light"'))