%run -i ../startup.py
ENTITY_TYPE = 'instrument'
Last notebook update: 2021-01-28
Importing libs
Defining database parameters
Defining *sql* helper function
Last database update: 2021-01-13
Defining *sparql* helper function
Wikidata entities which are musical instruments or families of musical instruments:
# instance of musical instrument
wd_musical_instruments = sparql("""
SELECT ?instrument ?instrumentLabel ?HornbostelSachs
WHERE {
{ ?instrument wdt:P31* wd:Q34379 . }
UNION
{ ?instrument wdt:P31 wd:Q1254773 . }
OPTIONAL
{ ?instrument wdt:P1762 ?HornbostelSachs . }
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
""")
wd_musical_instruments.rename(columns={
'instrument': 'wd', 'instrumentLabel': 'name'}, inplace=True)
wd_musical_instruments.head()
wd | name | HornbostelSachs | |
---|---|---|---|
0 | Q55724333 | friction drums with free stick | 231.13 |
1 | Q55724337 | rotating friction drums | 232.2 |
2 | Q55724342 | friction drums with tied stick | 231.2 |
3 | Q55724561 | stationary friction drums with friction cord | 232.1 |
4 | Q55724566 | single-skin stationary drums with friction cord | 232.11 |
Entities with “instrumental” links to MB:
# linked to MB instrument
links_from_wd = sparql("""
SELECT (?instrument AS ?wd) ?mbid ?instrumentLabel
WHERE {
?instrument wdt:P1330 ?mbid .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
ORDER BY ASC(?instrumentLabel)
""")
links_from_wd.rename(columns={'instrumentLabel': 'name'}, inplace=True)
display_df(links_from_wd.head())
wd | mbid | name | |
---|---|---|---|
0 | Q10751910 | 1c70cc38-deee-4a84-9b16-7a81c0f43aed | Q10751910 |
1 | Q13094251 | f6b76abc-fdef-444c-97b1-27b12e3e3c0b | Q13094251 |
2 | Q13094253 | 176ce29f-bcf0-47f5-beda-c97ef6df6480 | Q13094253 |
3 | Q13094254 | 14acd267-2e1b-459d-b41e-058d2c106345 | Q13094254 |
4 | Q13094255 | 4ac4b541-116d-40c1-93c7-160153ca53ac | Q13094255 |
Probably needs cleanup
set([wd for wd in links_from_wd.wd
if links_from_wd.wd.to_list().count(wd) > 1])
{'Q1398629'}
set([mbid for mbid in links_from_wd.mbid
if links_from_wd.mbid.to_list().count(mbid) > 1])
{'291e2d38-cdc9-4b9e-b592-5d653f32da6c',
'd5b46baa-37fc-46cc-b99f-c455ce6e6a9c'}
links_from_mb = sql("""
SELECT
url.url AS wd,
instrument.gid AS mbid,
instrument.name
FROM url
JOIN l_instrument_url AS llu ON llu.entity1 = url.id
JOIN instrument ON llu.entity0 = instrument.id
WHERE
url.url LIKE '%%wikidata.org%%'
ORDER BY instrument.name
;
""")
links_from_mb.wd = links_from_mb.wd.apply(lambda s: s.split('/')[-1])
links_from_mb.mbid = links_from_mb.mbid.apply(str)
display_df(links_from_mb.head())
wd | mbid | name | |
---|---|---|---|
0 | Q678090 | d5cc3c69-218e-449a-b80d-8bd7a61311a1 | 12 string guitar |
1 | Q4118803 | 21468ce3-bad3-4f48-a2ff-01e6b0bc9ca2 | 17-string bass koto |
2 | Q1024685 | 33b6ba89-8265-4d8f-bbdc-ecff41e29e8c | Afuche/Cabasa |
3 | Q18639099 | 26b4608d-acc3-46f3-b509-d83bd798e122 | Anglo concertina |
4 | Q2145031 | e618d02c-41c0-475c-be70-7ef0f92da7d0 | Appalachian dulcimer |
Probably needs cleanup
set([wd for wd in links_from_mb.wd
if links_from_mb.wd.to_list().count(wd) > 1])
{'Q1398629', 'Q5266546'}
set([mbid for mbid in links_from_mb.mbid
if links_from_mb.mbid.to_list().count(mbid) > 1])
{'291e2d38-cdc9-4b9e-b592-5d653f32da6c',
'b0f83029-6d38-4f6f-bd30-db44e427f497',
'd5b46baa-37fc-46cc-b99f-c455ce6e6a9c'}
merge = pd.merge(links_from_wd, links_from_mb,
on=['wd', 'mbid'], suffixes=('_wd', '_mb'),
how='outer', indicator=True)
display_df(merge.head())
wd | mbid | name_wd | name_mb | _merge | |
---|---|---|---|---|---|
0 | Q10751910 | 1c70cc38-deee-4a84-9b16-7a81c0f43aed | Q10751910 | cò ke | both |
1 | Q13094251 | f6b76abc-fdef-444c-97b1-27b12e3e3c0b | Q13094251 | kendhang batangan | both |
2 | Q13094253 | 176ce29f-bcf0-47f5-beda-c97ef6df6480 | Q13094253 | kendhang ketipung | both |
3 | Q13094254 | 14acd267-2e1b-459d-b41e-058d2c106345 | Q13094254 | kendhang gendhing | both |
4 | Q13094255 | 4ac4b541-116d-40c1-93c7-160153ca53ac | Q13094255 | kendhang wayangan | both |
# link in mb but missing in wd
links_to_add_to_wd = merge.loc[lambda x : x['_merge']=='right_only'][['name_mb', 'mbid', 'wd']]
display_df(links_to_add_to_wd)
24 links in MB that are not in WD
# link in wd but missing in mb
links_to_add_to_mb = merge.loc[lambda x : x['_merge']=='left_only'][['name_wd', 'wd', 'mbid']]
display_df(links_to_add_to_mb)
name_wd | wd | mbid | |
---|---|---|---|
0 | Rebana | Q3181140 | 551e553a-cadd-4363-8723-f72aab5431d0 |
1 | akkordolia | Q4701390 | bd5fa79a-ea6f-4e11-9463-f6f43cca363c |
2 | bell | Q101401 | c95c7129-d180-4218-afea-4b74ef70e2be |
3 | bellows-blown bagpipe | Q63619194 | d4cbc6fd-5e68-4cf4-afeb-dd2fb4df3c2d |
4 | dilrupa | Q5277044 | 57deb1b0-b7fe-4616-b9cb-bbd59bc0acd8 |
5 | fretless bass | Q932855 | 12f20f43-c71d-4476-8ada-b968aab50900 |
6 | lap slide guitar | Q6488060 | c0ea0405-ae3f-4851-bf85-277fadff80e2 |
7 | scraped idiophone | Q1644824 | dbd1ec09-34fd-412e-b73b-124e46cf7cdf |
8 | tromboon | Q840638 | ee499c78-26df-4698-ab97-c4120276eb1a |
9 links in WD that are not in MB
In those mismatches, some are not recognized because of redirects on WD side: Q54995817 to Q4138014, Q16033036 to Q3181140
no_links_from_mb = sql("""
SELECT
gid AS mbid,
name
FROM
instrument
WHERE
id NOT IN (
SELECT
instrument.id
FROM url
JOIN l_instrument_url AS llu ON llu.entity1 = url.id
JOIN instrument ON llu.entity0 = instrument.id
WHERE
url.url LIKE '%%wikidata.org%%'
)
;
""")
no_links_from_mb.mbid = no_links_from_mb.mbid.apply(str)
display_df(no_links_from_mb)
Exact match between instrument names in WD and MB:
no_links_merge = pd.merge(no_links_from_mb, wd_musical_instruments,
on='name', how='inner', indicator=False)
display_df(no_links_merge)
mbid | name | wd | HornbostelSachs | |
---|---|---|---|---|
0 | ca17a349-e0e3-4b9b-b74d-898a2b54b43e | syrinx | Q10902606 | NaN |
1 | c95c7129-d180-4218-afea-4b74ef70e2be | bell | Q101401 | 111.242 |
2 | c95c7129-d180-4218-afea-4b74ef70e2be | bell | Q96309259 | NaN |
3 | e5408785-e228-4919-9e79-6f997bcdfea5 | tube zither | Q30034781 | 312 |
Using fuzzy-matching to find close instrument names:
import fuzzymatcher
match = fuzzymatcher.fuzzy_left_join(
no_links_from_mb, wd_musical_instruments[['wd', 'name']],
left_on='name', right_on='name')[['best_match_score', 'mbid',
'name_left', 'name_right', 'wd']]
match = match[match['best_match_score'] > 0.09].sort_values(by='best_match_score',
ascending=False)
display_df(match, index=False)
import recordlinkage
# Indexation step
indexer = recordlinkage.SortedNeighbourhoodIndex('name', window=9)
pairs = indexer.index(no_links_from_mb, wd_musical_instruments[['wd', 'name']])
print(len(pairs))
# Comparison step
compare_cl = recordlinkage.Compare()
compare_cl.string('name', 'name', method='jarowinkler',
threshold=0.9, label='name')
features = compare_cl.compute(pairs, no_links_from_mb, wd_musical_instruments[['wd', 'name']])
print(features[features.sum(axis=1) > 0].shape)
# Classification step
linkage = []
for (idx0, idx1) in features[features.sum(axis=1) > 0].index:
linkage.append([
no_links_from_mb.loc[idx0]['mbid'],
no_links_from_mb.loc[idx0]['name'],
wd_musical_instruments.loc[idx1]['name'],
wd_musical_instruments.loc[idx1]['wd'],
])
display_df(pd.DataFrame(linkage, columns=('mbid', 'name_left', 'name_right', 'wd')),
index=False)
681
(5, 1)
mbid | name_left | name_right | wd |
---|---|---|---|
ca17a349-e0e3-4b9b-b74d-898a2b54b43e | syrinx | syrinx | Q10902606 |
c95c7129-d180-4218-afea-4b74ef70e2be | bell | bell | Q101401 |
c95c7129-d180-4218-afea-4b74ef70e2be | bell | bell | Q96309259 |
e5408785-e228-4919-9e79-6f997bcdfea5 | tube zither | tube zither | Q30034781 |
db36bd83-0606-42b9-91a0-d759ba52d0da | trumpet family | trumpet | Q8338 |
import jinja2
template = jinja2.Template("""
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Alignment of MusicBrainz and Wikidata Instruments</title>
<link href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
</head>
<body style="margin: 20px;">
<h1>Alignment of MusicBrainz and Wikidata Instruments</h1>
<p>Latest MB database update: </p>
<p>Latest update: </p>
<ol>
<li><a href="#wd2mb">Add missing Wikidata links to MusicBrainz</a></li>
<li><a href="#mb2wd">Add missing MusicBrainz links to Wikidata</a></li>
<li><a href="#alignment">Missing alignment suggestions</a>
</ol>
<h2 id="wd2mb">Add missing Wikidata links to MusicBrainz</h2>
<h2 id="mb2wd">Add missing MusicBrainz links to Wikidata</h2>
<h2 id="alignment">Missing alignment suggestions</h2>
<h3>Alignment on exact names</h3>
<h3>Alignment on fuzzy matching</h3>
</body>
</html>
""")
with open('../docs/wd-instruments-report.html', 'w') as f:
f.write(template.render(**globals())
.replace('<', '<').replace('>', '>')
.replace('class="dataframe"', 'class="table table-striped table-hover table-sm"')
.replace('thead', 'thead class="thead-light"'))