%run -i ../startup.py
ENTITY_TYPE = 'work'
Last notebook update: 2021-01-28
Importing libs
Defining database parameters
Defining *sql* helper function
Last database update: 2021-01-13
Defining *sparql* helper function
Wikidata entities:
https://www.wikidata.org/wiki/Q2188189 musical work
Wikidata properties:
https://www.wikidata.org/wiki/Property:P86 composer
https://www.wikidata.org/wiki/Property:P870 instrumentation
https://www.wikidata.org/wiki/Property:P214 VIAF
https://www.wikidata.org/wiki/Property:P268 BNF
https://www.wikidata.org/wiki/Property:P244 LoC
https://www.wikidata.org/wiki/Property:P839 IMSLP
https://www.wikidata.org/wiki/Property:P435 MusicBrainz work ID
https://www.wikidata.org/wiki/Property:P1994 AllMusic composition ID
https://www.wikidata.org/wiki/Property:P5229 Carnegie Hall work ID
https://www.wikidata.org/wiki/Property:P6080 Discogs composition ID (obsoleted by discogs in October 2019)
examples
https://www.wikidata.org/wiki/Q3478907
links_type_from_wd = sparql("""
SELECT distinct (count(?work) as ?cnt) ?ins ?insLabel
WHERE {
?work wdt:P31 ?ins;
wdt:P435 ?mbid.
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?ins ?insLabel
order by DESC(?cnt)
""")
links_type_from_wd[links_type_from_wd.cnt.astype(int) > 10]
cnt | ins | insLabel | |
---|---|---|---|
0 | 12075 | Q134556 | single |
1 | 12020 | Q7366 | song |
2 | 5297 | Q207628 | musical composition |
3 | 1165 | Q1344 | opera |
4 | 679 | Q7889 | video game |
... | ... | ... | ... |
105 | 12 | Q1667921 | novel series |
106 | 12 | Q2302678 | clarinet concerto |
107 | 12 | Q30340773 | quintet |
108 | 11 | Q211025 | march |
109 | 11 | Q541947 | anthem |
110 rows × 3 columns
sparql("""
SELECT (COUNT(?work) AS ?cnt)
WHERE {
?work wdt:P435 ?mbid.
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
""")
cnt | |
---|---|
0 | 34385 |
# linked to MB work
links_from_wd = sparql("""
SELECT (?work AS ?wd) ?mbid ?workLabel (GROUP_CONCAT(?categoryLabel; SEPARATOR=", ") AS ?workType)
WHERE {
?work wdt:P435 ?mbid .
?work wdt:P31 ?category .
?category rdfs:label ?categoryLabel FILTER(LANG(?categoryLabel) = "en")
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
GROUP BY ?work ?mbid ?workLabel
ORDER BY ASC(?workType) ASC(?workLabel)
""")
links_from_wd.rename(columns={'workLabel': 'name'}, inplace=True)
print('Count:', len(links_from_wd))
display_df(links_from_wd.head())
Count: 33886
wd | mbid | name | workType | |
---|---|---|---|---|
0 | Q5151283 | 0d6b1470-53a5-4885-9aaf-2157a43566a1 | Come, Thou Long Expected Jesus | Advent song, Christmas hymn |
1 | Q5151283 | 3b4fb873-1df3-3a82-b5ab-2a9325bf1875 | Come, Thou Long Expected Jesus | Advent song, Christmas hymn |
2 | Q5151283 | 87c901cf-fd2f-479d-ad07-668b405c1ee0 | Come, Thou Long Expected Jesus | Advent song, Christmas hymn |
3 | Q5151283 | 9ce6cf52-e963-455b-97ae-598b99547f91 | Come, Thou Long Expected Jesus | Advent song, Christmas hymn |
4 | Q16147392 | 3ae52335-d8c7-45d3-9940-ff6cc7de0b63 | All Things Bright and Beautiful | Anglican hymn |
links_from_mb = sql("""
SELECT
url.url AS wd,
work.gid AS mbid,
work.name
FROM work
JOIN l_url_work AS luw ON luw.entity1 = work.id
JOIN url ON luw.entity0 = url.id
WHERE
url.url LIKE '%%wikidata.org%%'
ORDER BY work.name
;
""")
links_from_mb.wd = links_from_mb.wd.apply(lambda s: s.split('/')[-1])
links_from_mb.mbid = links_from_mb.mbid.apply(str)
print('Count:', len(links_from_mb))
display_df(links_from_mb.head())
Count: 31842
wd | mbid | name | |
---|---|---|---|
0 | Q1195883 | a0c2aa81-3185-412d-93f2-eefe4606c70e | "Der Morgen und der Abend" 12 Musikstücke für das Hornwerk "Salzburger Stier" der Festung Hohensalzburg |
1 | Q1339833 | 054f14c0-8edf-43b5-8946-88f9bd6c3dc1 | "Geistervariationen" in E-flat major for piano solo, WoO 24 |
2 | Q10513545 | 76fdbe87-6c69-4c54-a6de-97a2dd3d2517 | "Ô meilleur des hommes! Celui qui t'aime est le véritable croyant" |
3 | Q3879355 | 55b4b32d-0908-4500-8ec5-f14b77953e92 | #1 Crush |
4 | Q945195 | 187f83e3-1a1f-3623-8e71-109442414cb0 | #9 Dream |
duplicate_wd = links_from_mb[[
'wd', 'mbid', 'name']].groupby('wd').filter(
lambda row: len(row.mbid) > 1).sort_values('wd')
print('Count:', len(duplicate_wd))
display_df(duplicate_wd.head())
Count: 1652
wd | mbid | name | |
---|---|---|---|
0 | Q1000352 | 948cc231-12a7-4b13-8022-e7e6e75ee8ba | Reckless |
1 | Q1000352 | 35765848-0832-402c-91b5-8b505919e271 | Reckless: Steinernes Fleisch |
2 | Q1027675 | 447b9b10-f278-4b13-b55a-8a925811260c | Got to Give It Up |
3 | Q1027675 | ba2dfdad-4591-3de3-9e1f-f9f0a1071f76 | Got to Give It Up |
4 | Q1028491 | e8e21649-afb6-338f-9c89-ac48dfe8df0a | Camelot |
duplicate_mb = links_from_mb[['wd', 'mbid', 'name']].groupby('mbid').filter(
lambda row: len(row.mbid) > 1).sort_values('mbid')
print('Count:', len(duplicate_mb))
display_df(duplicate_mb.head())
Count: 38
wd | mbid | name | |
---|---|---|---|
0 | Q62565775 | 088c2a4a-e9c3-46ee-ae23-8acb4d4294e4 | Deutschland |
1 | Q62566152 | 088c2a4a-e9c3-46ee-ae23-8acb4d4294e4 | Deutschland |
2 | Q29158512 | 0b5def1a-3892-4716-a7a2-2ee02b10c55f | Images, Livre 1, L. 110, CD 105 |
3 | Q1132712 | 0b5def1a-3892-4716-a7a2-2ee02b10c55f | Images, Livre 1, L. 110, CD 105 |
4 | Q512715 | 1319ddc3-910a-4c93-afc7-c5e5973f3b78 | Préludes, Livre I, L. 117, CD 125 |
merge = pd.merge(links_from_wd, links_from_mb,
on=['wd', 'mbid'], suffixes=('_wd', '_mb'),
how='outer', indicator=True)
display_df(merge.head())
wd | mbid | name_wd | workType | name_mb | _merge | |
---|---|---|---|---|---|---|
0 | Q5151283 | 0d6b1470-53a5-4885-9aaf-2157a43566a1 | Come, Thou Long Expected Jesus | Advent song, Christmas hymn | Come, Thou Long Expected Jesus | both |
1 | Q5151283 | 3b4fb873-1df3-3a82-b5ab-2a9325bf1875 | Come, Thou Long Expected Jesus | Advent song, Christmas hymn | Come, Thou Long Expected Jesus | both |
2 | Q5151283 | 87c901cf-fd2f-479d-ad07-668b405c1ee0 | Come, Thou Long Expected Jesus | Advent song, Christmas hymn | Come, Thou Long Expected Jesus | both |
3 | Q5151283 | 9ce6cf52-e963-455b-97ae-598b99547f91 | Come, Thou Long Expected Jesus | Advent song, Christmas hymn | Come, Thou Long Expected Jesus | both |
4 | Q331595 | 09d0b344-1396-4985-a3aa-d4b4db7e2b33 | Veni, Veni Emmanuel | Advent song, Christmas hymn | O Come, O Come, Emmanuel | both |
# link in mb but missing in wd
links_to_add_to_wd = merge.loc[lambda x : x['_merge']=='right_only'][[
'name_mb', 'mbid', 'wd']]
print('Count:', len(links_to_add_to_wd))
display_df(links_to_add_to_wd.head())
Count: 4692
name_mb | mbid | wd | |
---|---|---|---|
0 | "Ô meilleur des hommes! Celui qui t'aime est le véritable croyant" | 76fdbe87-6c69-4c54-a6de-97a2dd3d2517 | Q10513545 |
1 | (Can’t Get My) Head Around You | a7757771-757b-4ff8-9736-7ad065bd2bd3 | Q2550692 |
2 | (It’s All Down to) Goodnight Vienna | c740d2c6-3f54-42ca-8dc5-6a0fd2f1e2f2 | Q4041672 |
3 | (I’m Always Touched by Your) Presence, Dear | 299b84fe-bf16-3e22-b2c3-edf9796e0b0d | Q837484 |
4 | (I’m Gonna) Love Me Again | 94bbf621-3dd9-4f2b-88b5-4f3b3cebac5d | Q77816604 |
# link in wd but missing in mb
links_to_add_to_mb = merge.loc[lambda x : x['_merge']=='left_only'][[
'name_wd', 'wd', 'mbid', 'workType']]
links_to_add_to_mb['edit_link'] = links_to_add_to_mb.apply(
mb_work_edit_wd_link, axis=1)
print('Count:', len(links_to_add_to_mb))
display_df(links_to_add_to_mb.head())
Count: 6735
name_wd | wd | mbid | workType | edit_link | |
---|---|---|---|---|---|
0 | Veni, Veni Emmanuel | Q331595 | 0ae49426-2891-4ed3-aec6-ee95369fd0c8 | Advent song, Christmas hymn | edit |
1 | Veni, Veni Emmanuel | Q331595 | 523d5269-0212-41ae-997e-294267e5ddbf | Advent song, Christmas hymn | edit |
2 | Entfliehet, verschwindet, entweichet, ihr Sorgen, BWV 249a | Q1249980 | a5c89bd4-986d-4af4-82a0-e6d4bafc7e0c | Bach cantata | edit |
3 | Ich bin in mir vergnügt, BWV 204 | Q3147633 | e01a83bb-d8d1-4295-b6aa-22a28cc04d85 | Bach cantata | edit |
4 | Non sa che sia dolore, BWV 209 | Q2784522 | 43fa1a90-7f72-4e45-905f-3d845d500f66 | Bach cantata | edit |
import jinja2
template = jinja2.Template("""
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Alignment of MusicBrainz and Wikidata Works</title>
<link href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
</head>
<body style="margin: 20px;">
<h1>Alignment of MusicBrainz and Wikidata Works</h1>
<p>Latest MB database update: </p>
<p>Latest update: </p>
<ol>
<li>
<a href="#wddup">MusicBrainz entities sharing a Wikidata link</a>
( rows)
</li>
<li>
<a href="#mbdup">Wikidata entities sharing a MusicBrainz link</a>
( rows)
</li>
<li>
<a href="#wd2mb">Add missing Wikidata links to MusicBrainz</a>
( rows)
</li>
<li>
<a href="#mb2wd">Add missing MusicBrainz links to Wikidata</a>
( rows)
</li>
</ol>
<h2 id="wddup">MusicBrainz entities sharing a Wikidata link</h2>
<h2 id="mbdup">Wikidata entities sharing a MusicBrainz link</h2>
<h2 id="wd2mb">Add missing Wikidata links to MusicBrainz</h2>
<h2 id="mb2wd">Add missing MusicBrainz links to Wikidata</h2>
</body>
</html>
""")
with open('../docs/wd-works-report.html', 'w') as f:
f.write(template.render(**globals())
.replace('<', '<').replace('>', '>')
.replace('class="dataframe"', 'class="table table-striped table-hover table-sm"')
.replace('thead', 'thead class="thead-light"'))