musicbrainz-sparql

%run -i ../startup.py
ENTITY_TYPE = 'artist'
Last notebook update: 2021-01-30
Importing libs
Defining database parameters
Defining *sql* helper function
Last database update: 2021-01-13

Defining *sparql* helper function

Wikidata entities:

https://www.wikidata.org/wiki/Q482994 album

Wikidata properties:

https://www.wikidata.org/wiki/Property:P214 VIAF

https://www.wikidata.org/wiki/Property:P268 BNF

https://www.wikidata.org/wiki/Property:P244 LoC

https://www.wikidata.org/wiki/Property:P1953 discogs artist ID

https://www.wikidata.org/wiki/Property:P434 MusicBrainz Artist ID

examples

Artists from Wikidata

links_type_from_wd = sparql("""
SELECT distinct (count(?artist) as ?cnt) ?ins ?insLabel
WHERE {
  ?artist wdt:P31 ?ins;
    wdt:P434 ?mbid.
    SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?ins ?insLabel
order by DESC(?cnt)
""")
links_type_from_wd[links_type_from_wd.cnt.astype(int) > 10]
---------------------------------------------------------------------------

RemoteDisconnected                        Traceback (most recent call last)

~/mbz/musicbrainz-sparql/startup.py in <module>
----> 1 links_type_from_wd = sparql("""
      2 SELECT distinct (count(?artist) as ?cnt) ?ins ?insLabel
      3 WHERE {
      4   ?artist wdt:P31 ?ins;
      5     wdt:P434 ?mbid.


~/mbz/musicbrainz-sparql/startup.py in sparql(query, endpoint, **kwargs)
    131     wrapper = SPARQLWrapper2(endpoint)
    132     wrapper.setQuery(query)
--> 133     results = wrapper.query()
    134 
    135     def _clean_url(url):


~/.virtualenvs/mbsparq/lib/python3.9/site-packages/SPARQLWrapper/SmartWrapper.py in query(self)
    310             :rtype: :class:`Bindings` instance
    311         """
--> 312         res = super(SPARQLWrapper2, self).query()
    313 
    314         if self.queryType == SELECT:


~/.virtualenvs/mbsparq/lib/python3.9/site-packages/SPARQLWrapper/Wrapper.py in query(self)
   1105             :rtype: :class:`QueryResult` instance
   1106         """
-> 1107         return QueryResult(self._query())
   1108 
   1109     def queryAndConvert(self):


~/.virtualenvs/mbsparq/lib/python3.9/site-packages/SPARQLWrapper/Wrapper.py in _query(self)
   1071                 response = urlopener(request, timeout=self.timeout)
   1072             else:
-> 1073                 response = urlopener(request)
   1074             return response, self.returnFormat
   1075         except urllib.error.HTTPError as e:


/usr/lib/python3.9/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    212     else:
    213         opener = _opener
--> 214     return opener.open(url, data, timeout)
    215 
    216 def install_opener(opener):


/usr/lib/python3.9/urllib/request.py in open(self, fullurl, data, timeout)
    515 
    516         sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
--> 517         response = self._open(req, data)
    518 
    519         # post-process response


/usr/lib/python3.9/urllib/request.py in _open(self, req, data)
    532 
    533         protocol = req.type
--> 534         result = self._call_chain(self.handle_open, protocol, protocol +
    535                                   '_open', req)
    536         if result:


/usr/lib/python3.9/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    492         for handler in handlers:
    493             func = getattr(handler, meth_name)
--> 494             result = func(*args)
    495             if result is not None:
    496                 return result


/usr/lib/python3.9/urllib/request.py in https_open(self, req)
   1387 
   1388         def https_open(self, req):
-> 1389             return self.do_open(http.client.HTTPSConnection, req,
   1390                 context=self._context, check_hostname=self._check_hostname)
   1391 


/usr/lib/python3.9/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1348             except OSError as err: # timeout error
   1349                 raise URLError(err)
-> 1350             r = h.getresponse()
   1351         except:
   1352             h.close()


/usr/lib/python3.9/http/client.py in getresponse(self)
   1345         try:
   1346             try:
-> 1347                 response.begin()
   1348             except ConnectionError:
   1349                 self.close()


/usr/lib/python3.9/http/client.py in begin(self)
    305         # read until we get a non-100 response
    306         while True:
--> 307             version, status, reason = self._read_status()
    308             if status != CONTINUE:
    309                 break


/usr/lib/python3.9/http/client.py in _read_status(self)
    274             # Presumably, the server closed the connection before
    275             # sending a valid response.
--> 276             raise RemoteDisconnected("Remote end closed connection without"
    277                                      " response")
    278         try:


RemoteDisconnected: Remote end closed connection without response
# linked to MB artist
links_from_wd = sparql("""
SELECT distinct (count(?artist) as ?cnt)
WHERE {
  ?artist wdt:P434 ?mbid .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
ORDER BY ASC(?artistLabel)
""")
links_from_wd
cnt
0 220459

That’s too many to be fetched. Try the artists with a discogs link

# linked to MB artist
links_from_wd = sparql("""
SELECT distinct (count(?artist) as ?cnt)
WHERE {
  ?artist wdt:P434 ?mbid .
  ?artist wdt:P1953 ?discogs .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
ORDER BY ASC(?artistLabel)
""")
links_from_wd
cnt
0 139136
# linked to MB work
links_from_wd = sparql("""
SELECT (?artist AS ?wd) ?mbid ?artistLabel ?discogs
WHERE {
  ?artist wdt:P434 ?mbid .
  ?artist wdt:P1953 ?discogs .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
ORDER BY ASC(?artistLabel)
""")
links_from_wd.rename(columns={'artistLabel': 'name'}, inplace=True)

print('Count:', len(links_from_wd))
display_df(links_from_wd.head())
Count: 139136
wd mbid name discogs
0 Q100147499 3a7d3219-41e1-4f5a-87f4-fbf5c7aed04d Q100147499 1642404
1 Q100251627 a9639067-da3f-4747-b604-0269241d7494 Q100251627 5776203
2 Q100315776 d7c66a58-a91e-48b8-af59-0ea96b3e9f33 Q100315776 4750497
3 Q100450275 a1eec171-4baf-4e5f-8156-4fbc626f1ff8 Q100450275 554436
4 Q100995191 0c159e39-d25a-4d09-a33d-3d72c45f5324 Q100995191 1625893
links_from_mb = sql("""
SELECT
    url.url AS wd,
    artist.gid AS mbid,
    artist.name
FROM 
    artist
    JOIN l_artist_url AS lau ON lau.entity0 = artist.id
    JOIN url                 ON lau.entity1 = url.id
WHERE
    url.url LIKE '%%wikidata.org%%'
ORDER BY
    artist.name
;
""")
links_from_mb.wd = links_from_mb.wd.apply(lambda s: s.split('/')[-1])
links_from_mb.mbid = links_from_mb.mbid.apply(str)

print('Count:', len(links_from_mb))
display_df(links_from_mb.head())
Count: 200132
wd mbid name
0 Q371 f26c72d3-e52c-467b-b651-679c73d8e1a7 !!!
1 Q343686 8924ec2e-ff39-4968-a686-48effcb39d5e !Action Pact!
2 Q1183824 55cc41c4-d527-4b1a-b45d-4c2ff718f000 !DelaDap
3 Q1622767 54a1dd0e-a535-46a9-9ad8-b49ebe1ce482 !T.O.O.H.!
4 Q14565188 c60674c3-1cd8-4b2c-ada3-8f99d94c4aad !distain
duplicate_wd = links_from_mb[[
    'wd', 'mbid', 'name']].groupby('wd').filter(
    lambda row: len(row.mbid) > 1).sort_values('wd')

print('Count:', len(duplicate_wd))
display_df(duplicate_wd.head())
Count: 4461
wd mbid name
0 Q1004132 98452b79-5c2b-4e1c-9bb8-ff9592c2a061 Bull City Red
1 Q1004132 54143235-e5fb-4747-ae86-0433075a547d George Washington
2 Q100464 661be230-f092-4e64-b5ef-196f4e5f53e7 Wolfgang Roloff
3 Q100464 072f218c-da41-4a7a-b0aa-9aa816dc160c Ronny
4 Q1005957 6054baf8-3a47-4c3a-8c51-eb976f69ecfc Hobo
duplicate_mb = links_from_mb[['wd', 'mbid', 'name']].groupby('mbid').filter(
    lambda row: len(row.mbid) > 1).sort_values('mbid')

print('Count:', len(duplicate_mb))
display_df(duplicate_mb.head())
Count: 244
wd mbid name
0 Q79048065 01793aba-0995-4df1-86f2-baa31b4e4cb7 STAXKK
1 Q79048066 01793aba-0995-4df1-86f2-baa31b4e4cb7 STAXKK
2 Q57314794 06783a0f-cf91-4fe6-97dc-5f303dcdb36c 梅田綾乃
3 Q15908991 06783a0f-cf91-4fe6-97dc-5f303dcdb36c 梅田綾乃
4 Q98443953 10d626a3-ce89-4fe7-b8e0-b502a9148d60 Sebastian Barrera

Data alignment

merge = pd.merge(links_from_wd, links_from_mb, 
                 on=['wd', 'mbid'], suffixes=('_wd', '_mb'),
                 how='outer', indicator=True)
display_df(merge.head())
wd mbid name_wd discogs name_mb _merge
0 Q100147499 3a7d3219-41e1-4f5a-87f4-fbf5c7aed04d Q100147499 1642404 NaN left_only
1 Q100251627 a9639067-da3f-4747-b604-0269241d7494 Q100251627 5776203 NaN left_only
2 Q100315776 d7c66a58-a91e-48b8-af59-0ea96b3e9f33 Q100315776 4750497 NaN left_only
3 Q100450275 a1eec171-4baf-4e5f-8156-4fbc626f1ff8 Q100450275 554436 Alistair Anderson both
4 Q100995191 0c159e39-d25a-4d09-a33d-3d72c45f5324 Q100995191 1625893 NaN left_only
# link in mb but missing in wd
links_to_add_to_wd = merge.loc[lambda x : x['_merge']=='right_only'][[
    'name_mb', 'mbid', 'wd']]

print('Count:', len(links_to_add_to_wd))
display_df(links_to_add_to_wd.head())
Count: 85896
name_mb mbid wd
0 "Kid" Prince Moore 72050802-74fe-46a8-b926-ff08740e1058 Q28008608
1 "Mean" Gene Okerlund 44843557-35d0-4bd1-afae-bd162fb007fc Q955033
2 "a band called David" f583f07d-c1e2-407d-b869-9d7c6a6a34af Q4655343
3 #1 Dads 7aaf9000-eceb-4290-b38d-8cb4242c7730 Q22080866
4 #2Маши c352886d-fa34-4d6e-8c88-688c1c8c29d1 Q62964320
# link in wd but missing in mb
links_to_add_to_mb = merge.loc[lambda x : x['_merge']=='left_only'][[
    'name_wd', 'wd', 'mbid']]
links_to_add_to_mb['edit_link'] = links_to_add_to_mb.apply(
    mb_artist_edit_wd_link, axis=1)

print('Count:', len(links_to_add_to_mb))
display_df(links_to_add_to_mb.head())
Count: 21407
name_wd wd mbid edit_link
0 Q100147499 Q100147499 3a7d3219-41e1-4f5a-87f4-fbf5c7aed04d edit
1 Q100251627 Q100251627 a9639067-da3f-4747-b604-0269241d7494 edit
2 Q100315776 Q100315776 d7c66a58-a91e-48b8-af59-0ea96b3e9f33 edit
3 Q100995191 Q100995191 0c159e39-d25a-4d09-a33d-3d72c45f5324 edit
4 Q101072266 Q101072266 b29c45fe-3e06-4736-9053-1cbe93f61691 edit

Data alignment through Discogs

TBD on wd entries with discogs links and no mb link

# linked to Discogs artist
discogs_links_from_wd = sparql("""
SELECT (?artist AS ?wd) ?artistLabel ?discogs
WHERE {
  ?artist wdt:P1953 ?discogs .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
  MINUS {
    ?artist wdt:P434 ?mbid .  
  }
}
ORDER BY ASC(?artistLabel)
""")
discogs_links_from_wd.rename(columns={'artistLabel': 'name'}, inplace=True)

print('Count:', len(discogs_links_from_wd))
display_df(discogs_links_from_wd.head())
Count: 37976
wd name discogs
0 Q100024805 Q100024805 3469656
1 Q100266269 Q100266269 5025860
2 Q100268968 Q100268968 2763300
3 Q100272130 Q100272130 1005141
4 Q100448674 Q100448674 3792973
discogs_links_from_mb = sql("""
SELECT
    url.url AS discogs,
    artist.gid AS mbid,
    artist.name
FROM 
    artist
    JOIN l_artist_url AS lau ON lau.entity0 = artist.id
    JOIN url                 ON lau.entity1 = url.id
WHERE
    url.url LIKE '%%discogs.com%%'
    AND lau.entity0 IN (
        SELECT
            entity0
        FROM 
            l_artist_url
            JOIN url ON l_artist_url.entity1 = url.id
        WHERE
            url.url LIKE '%%discogs.com%%'
    EXCEPT
        SELECT
            entity0
        FROM 
            l_artist_url
            JOIN url ON l_artist_url.entity1 = url.id
        WHERE
            url.url LIKE '%%wikidata.org%%'
    )
ORDER BY
    artist.name
;
""")
discogs_links_from_mb.discogs = discogs_links_from_mb.discogs.apply(lambda s: s.split('/')[-1])
discogs_links_from_mb.mbid = discogs_links_from_mb.mbid.apply(str)

print('Count:', len(discogs_links_from_mb))
display_df(discogs_links_from_mb.head())
Count: 575584
discogs mbid name
0 2738698 d0d9b2a4-c5f0-4407-90cd-44dfd513158b ! Obtain?
1 1101118 ba813b0d-982a-491a-8648-3b2097b069dd !!!
2 542719 a057fba4-ac0e-4745-88e7-6e89c586e9cb !!!
3 1036695 19605e24-020f-4493-97b5-a4f9880a3ac5 !!*
4 213060 acec84ff-c4cb-420c-96cf-982b651573bd !!Swanhunter
discogs_merge = pd.merge(discogs_links_from_wd, discogs_links_from_mb, 
                 on=['discogs'], suffixes=('_wd', '_mb'),
                 how='inner', indicator=False)
discogs_merge['edit_link'] = discogs_merge.apply(
    mb_artist_edit_wd_link, axis=1)

print('Count:', len(discogs_merge))
display_df(discogs_merge.head())
Count: 8748
wd name_wd discogs mbid name_mb edit_link
0 Q100268968 Q100268968 2763300 9df09077-c63d-45c0-bc60-114589f83996 Attilio Staffelli edit
1 Q101158997 Q101158997 5487155 e4f25fd5-9e46-48b9-b5b8-cf3633a157a1 Floating Sofa Quartet edit
2 Q101429533 Q101429533 2038099 7dd59664-0b60-4eef-9280-491e6c679d37 Robin Fincker edit
3 Q101504558 Q101504558 601285 9554bc87-d985-4f73-a394-5c52fbca6449 Marius Beets edit
4 Q101935166 Q101935166 3201305 e6470441-2cc3-4b3f-b361-92c5f23f9707 Constantin Herzog edit

Report

import jinja2

template = jinja2.Template("""
<!doctype html>

<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>Alignment of MusicBrainz and Wikidata Artists</title>
    <link href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
  </head>

  <body style="margin: 20px;">
    <h1>Alignment of MusicBrainz and Wikidata Artists</h1>

    <p>Latest MB database update: </p>
    <p>Latest update: </p>

    <ol>
      <li>
        <a href="#wddup">MusicBrainz entities sharing a Wikidata link</a>
        ( rows)
      </li>
      <li>
        <a href="#mbdup">Wikidata entities sharing a MusicBrainz link</a>
        ( rows)
      </li>
      <li>
        <a href="#wd2mb">Add missing Wikidata links to MusicBrainz</a>
        ( rows)
      </li>
      <li>
        <a href="#discogs2mb">Add missing Wikidata links to MusicBrainz (through Discogs)</a>
        ( rows)
      </li>
      <li>
        <a href="#mb2wd">Add missing MusicBrainz links to Wikidata</a>
      </li>
    </ol>
    
    <h2 id="wddup">MusicBrainz entities sharing a Wikidata link</h2>
    

    <h2 id="mbdup">Wikidata entities sharing a MusicBrainz link</h2>
    

    <h2 id="wd2mb">Add missing Wikidata links to MusicBrainz</h2>
    

    <h2 id="discogs2mb">Add missing Wikidata links to MusicBrainz (through Discogs)</h2>
    

    <h2 id="mb2wd">Add missing MusicBrainz links to Wikidata</h2>
    
  </body>
</html>
""")

with open('../docs/wd-artists-report.html', 'w') as f:
    f.write(template.render(**globals())
            .replace('&lt;', '<').replace('&gt;', '>')
            .replace('class="dataframe"', 'class="table table-striped table-hover table-sm"')
            .replace('thead', 'thead class="thead-light"'))