musicbrainz-sparql

%run -i ../startup.py
ENTITY_TYPE = 'work'
Last notebook update: 2021-01-28
Importing libs
Defining database parameters
Defining *sql* helper function
Last database update: 2021-01-13

Defining *sparql* helper function

Wikidata entities:

https://www.wikidata.org/wiki/Q2188189 musical work

Wikidata properties:

https://www.wikidata.org/wiki/Property:P86 composer

https://www.wikidata.org/wiki/Property:P870 instrumentation

https://www.wikidata.org/wiki/Property:P214 VIAF

https://www.wikidata.org/wiki/Property:P268 BNF

https://www.wikidata.org/wiki/Property:P244 LoC

https://www.wikidata.org/wiki/Property:P839 IMSLP

https://www.wikidata.org/wiki/Property:P435 MusicBrainz work ID

https://www.wikidata.org/wiki/Property:P1994 AllMusic composition ID

https://www.wikidata.org/wiki/Property:P5229 Carnegie Hall work ID

https://www.wikidata.org/wiki/Property:P6080 Discogs composition ID (obsoleted by discogs in October 2019)

examples

https://www.wikidata.org/wiki/Q3478907

Works from Wikidata

links_type_from_wd = sparql("""
SELECT distinct (count(?work) as ?cnt) ?ins ?insLabel
WHERE {
  ?work wdt:P31 ?ins;
    wdt:P435 ?mbid.
    SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?ins ?insLabel
order by DESC(?cnt)
""")
links_type_from_wd[links_type_from_wd.cnt.astype(int) > 10]
cnt ins insLabel
0 12075 Q134556 single
1 12020 Q7366 song
2 5297 Q207628 musical composition
3 1165 Q1344 opera
4 679 Q7889 video game
... ... ... ...
105 12 Q1667921 novel series
106 12 Q2302678 clarinet concerto
107 12 Q30340773 quintet
108 11 Q211025 march
109 11 Q541947 anthem

110 rows × 3 columns

sparql("""
SELECT (COUNT(?work) AS ?cnt)
WHERE {
  ?work wdt:P435 ?mbid.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
""")
cnt
0 34385
# linked to MB work
links_from_wd = sparql("""
SELECT (?work AS ?wd) ?mbid ?workLabel (GROUP_CONCAT(?categoryLabel; SEPARATOR=", ") AS ?workType)
WHERE {
  ?work wdt:P435 ?mbid .
  ?work wdt:P31 ?category .
  ?category rdfs:label ?categoryLabel FILTER(LANG(?categoryLabel) = "en")
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
GROUP BY ?work ?mbid ?workLabel
ORDER BY ASC(?workType) ASC(?workLabel)
""")
links_from_wd.rename(columns={'workLabel': 'name'}, inplace=True)
print('Count:', len(links_from_wd))
display_df(links_from_wd.head())
Count: 33886
wd mbid name workType
0 Q5151283 0d6b1470-53a5-4885-9aaf-2157a43566a1 Come, Thou Long Expected Jesus Advent song, Christmas hymn
1 Q5151283 3b4fb873-1df3-3a82-b5ab-2a9325bf1875 Come, Thou Long Expected Jesus Advent song, Christmas hymn
2 Q5151283 87c901cf-fd2f-479d-ad07-668b405c1ee0 Come, Thou Long Expected Jesus Advent song, Christmas hymn
3 Q5151283 9ce6cf52-e963-455b-97ae-598b99547f91 Come, Thou Long Expected Jesus Advent song, Christmas hymn
4 Q16147392 3ae52335-d8c7-45d3-9940-ff6cc7de0b63 All Things Bright and Beautiful Anglican hymn
links_from_mb = sql("""
SELECT
    url.url AS wd,
    work.gid AS mbid,
    work.name
FROM work
JOIN l_url_work AS luw ON luw.entity1 = work.id
JOIN url               ON luw.entity0 = url.id
WHERE
    url.url LIKE '%%wikidata.org%%'
ORDER BY work.name
;
""")
links_from_mb.wd = links_from_mb.wd.apply(lambda s: s.split('/')[-1])
links_from_mb.mbid = links_from_mb.mbid.apply(str)

print('Count:', len(links_from_mb))
display_df(links_from_mb.head())
Count: 31842
wd mbid name
0 Q1195883 a0c2aa81-3185-412d-93f2-eefe4606c70e "Der Morgen und der Abend" 12 Musikstücke für das Hornwerk "Salzburger Stier" der Festung Hohensalzburg
1 Q1339833 054f14c0-8edf-43b5-8946-88f9bd6c3dc1 "Geistervariationen" in E-flat major for piano solo, WoO 24
2 Q10513545 76fdbe87-6c69-4c54-a6de-97a2dd3d2517 "Ô meilleur des hommes! Celui qui t'aime est le véritable croyant"
3 Q3879355 55b4b32d-0908-4500-8ec5-f14b77953e92 #1 Crush
4 Q945195 187f83e3-1a1f-3623-8e71-109442414cb0 #9 Dream
duplicate_wd = links_from_mb[[
    'wd', 'mbid', 'name']].groupby('wd').filter(
    lambda row: len(row.mbid) > 1).sort_values('wd')

print('Count:', len(duplicate_wd))
display_df(duplicate_wd.head())
Count: 1652
wd mbid name
0 Q1000352 948cc231-12a7-4b13-8022-e7e6e75ee8ba Reckless
1 Q1000352 35765848-0832-402c-91b5-8b505919e271 Reckless: Steinernes Fleisch
2 Q1027675 447b9b10-f278-4b13-b55a-8a925811260c Got to Give It Up
3 Q1027675 ba2dfdad-4591-3de3-9e1f-f9f0a1071f76 Got to Give It Up
4 Q1028491 e8e21649-afb6-338f-9c89-ac48dfe8df0a Camelot
duplicate_mb = links_from_mb[['wd', 'mbid', 'name']].groupby('mbid').filter(
    lambda row: len(row.mbid) > 1).sort_values('mbid')

print('Count:', len(duplicate_mb))
display_df(duplicate_mb.head())
Count: 38
wd mbid name
0 Q62565775 088c2a4a-e9c3-46ee-ae23-8acb4d4294e4 Deutschland
1 Q62566152 088c2a4a-e9c3-46ee-ae23-8acb4d4294e4 Deutschland
2 Q29158512 0b5def1a-3892-4716-a7a2-2ee02b10c55f Images, Livre 1, L. 110, CD 105
3 Q1132712 0b5def1a-3892-4716-a7a2-2ee02b10c55f Images, Livre 1, L. 110, CD 105
4 Q512715 1319ddc3-910a-4c93-afc7-c5e5973f3b78 Préludes, Livre I, L. 117, CD 125

Data alignment

merge = pd.merge(links_from_wd, links_from_mb, 
                 on=['wd', 'mbid'], suffixes=('_wd', '_mb'),
                 how='outer', indicator=True)
display_df(merge.head())
wd mbid name_wd workType name_mb _merge
0 Q5151283 0d6b1470-53a5-4885-9aaf-2157a43566a1 Come, Thou Long Expected Jesus Advent song, Christmas hymn Come, Thou Long Expected Jesus both
1 Q5151283 3b4fb873-1df3-3a82-b5ab-2a9325bf1875 Come, Thou Long Expected Jesus Advent song, Christmas hymn Come, Thou Long Expected Jesus both
2 Q5151283 87c901cf-fd2f-479d-ad07-668b405c1ee0 Come, Thou Long Expected Jesus Advent song, Christmas hymn Come, Thou Long Expected Jesus both
3 Q5151283 9ce6cf52-e963-455b-97ae-598b99547f91 Come, Thou Long Expected Jesus Advent song, Christmas hymn Come, Thou Long Expected Jesus both
4 Q331595 09d0b344-1396-4985-a3aa-d4b4db7e2b33 Veni, Veni Emmanuel Advent song, Christmas hymn O Come, O Come, Emmanuel both
# link in mb but missing in wd
links_to_add_to_wd = merge.loc[lambda x : x['_merge']=='right_only'][[
    'name_mb', 'mbid', 'wd']]

print('Count:', len(links_to_add_to_wd))
display_df(links_to_add_to_wd.head())
Count: 4692
name_mb mbid wd
0 "Ô meilleur des hommes! Celui qui t'aime est le véritable croyant" 76fdbe87-6c69-4c54-a6de-97a2dd3d2517 Q10513545
1 (Can’t Get My) Head Around You a7757771-757b-4ff8-9736-7ad065bd2bd3 Q2550692
2 (It’s All Down to) Goodnight Vienna c740d2c6-3f54-42ca-8dc5-6a0fd2f1e2f2 Q4041672
3 (I’m Always Touched by Your) Presence, Dear 299b84fe-bf16-3e22-b2c3-edf9796e0b0d Q837484
4 (I’m Gonna) Love Me Again 94bbf621-3dd9-4f2b-88b5-4f3b3cebac5d Q77816604
# link in wd but missing in mb
links_to_add_to_mb = merge.loc[lambda x : x['_merge']=='left_only'][[
    'name_wd', 'wd', 'mbid', 'workType']]
links_to_add_to_mb['edit_link'] = links_to_add_to_mb.apply(
    mb_work_edit_wd_link, axis=1)

print('Count:', len(links_to_add_to_mb))
display_df(links_to_add_to_mb.head())
Count: 6735
name_wd wd mbid workType edit_link
0 Veni, Veni Emmanuel Q331595 0ae49426-2891-4ed3-aec6-ee95369fd0c8 Advent song, Christmas hymn edit
1 Veni, Veni Emmanuel Q331595 523d5269-0212-41ae-997e-294267e5ddbf Advent song, Christmas hymn edit
2 Entfliehet, verschwindet, entweichet, ihr Sorgen, BWV 249a Q1249980 a5c89bd4-986d-4af4-82a0-e6d4bafc7e0c Bach cantata edit
3 Ich bin in mir vergnügt, BWV 204 Q3147633 e01a83bb-d8d1-4295-b6aa-22a28cc04d85 Bach cantata edit
4 Non sa che sia dolore, BWV 209 Q2784522 43fa1a90-7f72-4e45-905f-3d845d500f66 Bach cantata edit

Report

import jinja2

template = jinja2.Template("""
<!doctype html>

<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>Alignment of MusicBrainz and Wikidata Works</title>
    <link href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
  </head>

  <body style="margin: 20px;">
    <h1>Alignment of MusicBrainz and Wikidata Works</h1>

    <p>Latest MB database update: </p>
    <p>Latest update: </p>

    <ol>
      <li>
        <a href="#wddup">MusicBrainz entities sharing a Wikidata link</a>
        ( rows)
      </li>
      <li>
        <a href="#mbdup">Wikidata entities sharing a MusicBrainz link</a>
        ( rows)
      </li>
      <li>
        <a href="#wd2mb">Add missing Wikidata links to MusicBrainz</a>
        ( rows)
      </li>
      <li>
        <a href="#mb2wd">Add missing MusicBrainz links to Wikidata</a>
        ( rows)
      </li>
    </ol>
    
    <h2 id="wddup">MusicBrainz entities sharing a Wikidata link</h2>
    

    <h2 id="mbdup">Wikidata entities sharing a MusicBrainz link</h2>
    

    <h2 id="wd2mb">Add missing Wikidata links to MusicBrainz</h2>
    

    <h2 id="mb2wd">Add missing MusicBrainz links to Wikidata</h2>
    
  </body>
</html>
""")

with open('../docs/wd-works-report.html', 'w') as f:
    f.write(template.render(**globals())
            .replace('&lt;', '<').replace('&gt;', '>')
            .replace('class="dataframe"', 'class="table table-striped table-hover table-sm"')
            .replace('thead', 'thead class="thead-light"'))