%run -i ../startup.py
import numpy as np
ENTITY_TYPE = 'label'
Last notebook update: 2021-01-28
Importing libs
Defining database parameters
Defining *sql* helper function
Last database update: 2021-01-13
Defining *sparql* helper function
Wikidata entities:
https://www.wikidata.org/wiki/Q18127 record label
Wikidata properties:
https://www.wikidata.org/wiki/Property:P214 VIAF
https://www.wikidata.org/wiki/Property:P268 BNF
https://www.wikidata.org/wiki/Property:P244 LoC
https://www.wikidata.org/wiki/Property:P1955 discogs ID
https://www.wikidata.org/wiki/Property:P966 MusicBrainz label ID
examples
https://www.wikidata.org/wiki/Q885833
links_type_from_wd = sparql("""
SELECT distinct (count(?label) as ?cnt) ?ins ?insLabel
WHERE {
?label wdt:P31 ?ins;
wdt:P966 ?mbid.
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?ins ?insLabel
order by DESC(?cnt)
""")
links_type_from_wd[links_type_from_wd.cnt.astype(int) > 10]
cnt | ins | insLabel | |
---|---|---|---|
0 | 5310 | Q18127 | record label |
1 | 766 | Q4830453 | business |
2 | 340 | Q2442401 | record company |
3 | 282 | Q1542343 | independent record label |
4 | 174 | Q2608849 | imprint |
5 | 164 | Q6881511 | enterprise |
6 | 158 | Q2085381 | publisher |
7 | 144 | Q210167 | video game developer |
8 | 114 | Q5354754 | talent agency |
9 | 102 | Q41298 | magazine |
10 | 95 | Q658255 | subsidiary |
11 | 90 | Q1917775 | music publishing company |
12 | 82 | Q1320047 | book publisher |
13 | 80 | Q43229 | organization |
14 | 54 | Q783794 | company |
15 | 52 | Q1137109 | video game publisher |
16 | 43 | Q14350 | radio station |
17 | 41 | Q1762059 | film production company |
18 | 33 | Q1047437 | copyright collective |
19 | 31 | Q35127 | website |
20 | 30 | Q167270 | trademark |
21 | 26 | Q12540664 | distributor |
22 | 26 | Q20739124 | entertainment company |
23 | 25 | Q219577 | holding company |
24 | 22 | Q1616075 | television station |
25 | 21 | Q1107679 | animation studio |
26 | 21 | Q1002697 | periodical |
27 | 18 | Q3719391 | sheet music publisher |
28 | 18 | Q163740 | nonprofit organization |
29 | 17 | Q15265344 | broadcaster |
30 | 17 | Q215380 | musical group |
31 | 17 | Q11032 | newspaper |
32 | 16 | Q1110794 | daily newspaper |
33 | 16 | Q708676 | charitable organization |
34 | 16 | Q32178211 | music organization |
35 | 15 | Q1824338 | music magazine |
36 | 14 | Q10689397 | television production company |
37 | 14 | Q726870 | brick and mortar |
38 | 14 | Q778575 | conglomerate |
39 | 13 | Q2001305 | television channel |
40 | 12 | Q1802587 | German public state broadcaster |
41 | 12 | Q431289 | brand |
42 | 12 | Q1331793 | media company |
43 | 12 | Q1001388 | budget label |
44 | 11 | Q2243978 | record shop |
45 | 11 | Q1589009 | privately held company |
46 | 11 | Q1660312 | major label |
# linked to MB label
links_from_wd = sparql("""
SELECT (?label AS ?wd) ?mbid ?labelLabel ?lc ?viaf ?discogs
WHERE {
?label wdt:P966 ?mbid .
OPTIONAL { ?label wdt:P7320 ?lc . }
# OPTIONAL { ?label wdt:P214 ?viaf . }
# OPTIONAL { ?label wdt:P1955 ?discogs . }
# OPTIONAL { ?label wdt:P268 ?bnf . }
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
ORDER BY ASC(?labelLabel)
""")
links_from_wd.rename(columns={'labelLabel': 'name'}, inplace=True)
links_from_wd.lc = links_from_wd.lc.apply(
lambda lc: lc if isinstance(lc, str) else '')
#links_from_wd.discogs = links_from_wd.discogs.apply(
# lambda discogs: discogs if isinstance(discogs, str) else '')
print('Count:', len(links_from_wd))
display_df(links_from_wd.head())
Count: 7771
wd | mbid | name | lc | |
---|---|---|---|---|
0 | Q1017332 | fc9526f3-dded-4ce5-8138-b8011c9d3bd2 | Q1017332 | |
1 | Q102226212 | d927bbc7-5db0-44cc-9b3f-efde37850194 | Q102226212 | |
2 | Q102276807 | af5e3a19-b40b-4269-b15b-9cb2686fde43 | Q102276807 | |
3 | Q10316316 | 1b4f4f26-37e5-4e19-be79-3515c9ce5961 | Q10316316 | |
4 | Q10316316 | a32f2beb-5297-4f3e-b0b4-d42ddbbf9a2b | Q10316316 |
links_from_mb = sql("""
SELECT
url.url AS wd,
label.gid AS mbid,
label.name,
label.label_code AS lc
FROM
label
JOIN l_label_url AS llu ON llu.entity0 = label.id
JOIN url ON llu.entity1 = url.id
WHERE
url.url LIKE '%%wikidata.org%%'
ORDER BY
label.name
;
""")
links_from_mb.wd = links_from_mb.wd.apply(lambda s: s.split('/')[-1])
links_from_mb.mbid = links_from_mb.mbid.apply(str)
links_from_mb.lc = links_from_mb.lc.apply(
lambda lc: '' if np.isnan(lc) else str(int(lc)))
print('Count:', len(links_from_mb))
display_df(links_from_mb.head())
Count: 7033
wd | mbid | name | lc | |
---|---|---|---|---|
0 | Q315760 | 893364ff-1830-4694-833d-93751f14f984 | !K7 | 7306 |
1 | Q4544982 | becff808-7fa6-4f85-ab15-0444bca5d04c | +1 Records | |
2 | Q58209721 | 3129e8bf-cd5c-428d-b79a-51f77445a3f9 | 1 Numara Plakçılık | |
3 | Q4545734 | 7441004d-1165-4c04-b029-7c8d59c5912a | 1-2-3-4 Go! Records | |
4 | Q3983384 | 5a8514dd-22d2-4def-a48d-3a8b71a8e24f | 10 Records | 3098 |
duplicate_wd = links_from_mb[[
'wd', 'mbid', 'name']].groupby('wd').filter(
lambda row: len(row.mbid) > 1).sort_values('wd')
print('Count:', len(duplicate_wd))
display_df(duplicate_wd.head())
Count: 622
wd | mbid | name | |
---|---|---|---|
0 | Q1011446 | 2f8b7a22-70c3-457c-bb57-194236f9435e | Gold Record |
1 | Q1011446 | 3737052c-6104-4c0c-9fff-cab022fc8800 | Gold Record |
2 | Q10263636 | 6ca8e712-4a92-4f9a-b180-16bd7d157176 | Dance World Attack |
3 | Q10263636 | bd0487d9-1740-4426-8740-85a7a2375c33 | DWA (Dance World Attack) |
4 | Q10290705 | d2e62ad3-2530-4322-9e72-0d3830aaee82 | Gospel Records |
duplicate_mb = links_from_mb[['wd', 'mbid', 'name']].groupby('mbid').filter(
lambda row: len(row.mbid) > 1).sort_values('mbid')
print('Count:', len(duplicate_mb))
display_df(duplicate_mb.head())
Count: 18
wd | mbid | name | |
---|---|---|---|
0 | Q50021153 | 158b407e-26d7-48d9-b6ba-639861e1bffc | Zafiro |
1 | Q50020430 | 158b407e-26d7-48d9-b6ba-639861e1bffc | Zafiro |
2 | Q55908907 | 1ff104ed-0536-4dfd-b3f4-2b053ea66f32 | Capitol Records Nashville |
3 | Q5035920 | 1ff104ed-0536-4dfd-b3f4-2b053ea66f32 | Capitol Records Nashville |
4 | Q8602939 | 5b34c929-fd09-4b57-a8bf-1e8eb0f21c12 | Lost Highway Records |
merge = pd.merge(links_from_wd, links_from_mb,
on=['wd', 'mbid'], suffixes=('_wd', '_mb'),
how='outer', indicator=True)
display_df(merge.head())
wd | mbid | name_wd | lc_wd | name_mb | lc_mb | _merge | |
---|---|---|---|---|---|---|---|
0 | Q1017332 | fc9526f3-dded-4ce5-8138-b8011c9d3bd2 | Q1017332 | BuschFunk | 6312 | both | |
1 | Q102226212 | d927bbc7-5db0-44cc-9b3f-efde37850194 | Q102226212 | NaN | NaN | left_only | |
2 | Q102276807 | af5e3a19-b40b-4269-b15b-9cb2686fde43 | Q102276807 | NaN | NaN | left_only | |
3 | Q10316316 | 1b4f4f26-37e5-4e19-be79-3515c9ce5961 | Q10316316 | 星光唱片 | both | ||
4 | Q10316316 | a32f2beb-5297-4f3e-b0b4-d42ddbbf9a2b | Q10316316 | Star Records Ltd. | both |
# link in mb but missing in wd
links_to_add_to_wd = merge.loc[lambda x : x['_merge']=='right_only'][[
'name_mb', 'mbid', 'wd']]
print('Count:', len(links_to_add_to_wd))
display_df(links_to_add_to_wd.head())
Count: 589
name_mb | mbid | wd | |
---|---|---|---|
0 | 125 | 13b6890a-0b41-4271-9f18-51d2f344cb2c | Q4548425 |
1 | 20|20|20 | b1a86e1f-02c7-42c4-a741-fd3fd6e1c1a1 | Q98226934 |
2 | 3517 Records | 3032dfb9-6bbc-4baa-8819-62d2bff7bc84 | Q96515410 |
3 | 407 Records | 4e61e8b0-7328-4e78-a663-a2d5bfbd0d17 | Q63133237 |
4 | 50/50innertainment | 1bd94d7c-0dd4-45a5-824c-dd434d36fd32 | Q23055027 |
# link in wd but missing in mb
links_to_add_to_mb = merge.loc[lambda x : x['_merge']=='left_only'][[
'name_wd', 'wd', 'mbid']]
links_to_add_to_mb['edit_link'] = links_to_add_to_mb.apply(
mb_label_edit_wd_link, axis=1)
print('Count:', len(links_to_add_to_mb))
display_df(links_to_add_to_mb.head())
Count: 1328
name_wd | wd | mbid | edit_link | |
---|---|---|---|---|
0 | Q102226212 | Q102226212 | d927bbc7-5db0-44cc-9b3f-efde37850194 | edit |
1 | Q102276807 | Q102276807 | af5e3a19-b40b-4269-b15b-9cb2686fde43 | edit |
2 | Q10564076 | Q10564076 | 14bfabf3-2cab-4f74-bac5-52c8ccce7b47 | edit |
3 | Q11175283 | Q11175283 | 33ccae50-a5a4-46bc-85ec-f65e111e56e8 | edit |
4 | Q11284795 | Q11284795 | 8c24cbcc-ff67-4f21-8b1f-6daa5366e54a | edit |
Example: https://musicbrainz.org/label/85bb6180-ac99-46b5-a3ec-92967e88f842 is not linked to https://www.wikidata.org/wiki/Q56809543
https://musicbrainz.org/label/86e1fce2-a61e-4d08-9d4f-b91d602f995b is linked to https://www.wikidata.org/wiki/Q24950167 that was removed in https://www.wikidata.org/w/index.php?title=Special:Log&logid=661147435
https://musicbrainz.org/label/69de915d-e7b5-4739-bd21-2de1099a0610 is linked to https://www.wikidata.org/wiki/Q4146440 and not the opposite
lc_to_add_to_mb = merge.loc[
(merge._merge=='both') & (merge.lc_mb == '') & (merge.lc_wd != '')
][['mbid', 'wd', 'lc_wd']].sort_values(by='lc_wd')
lc_to_add_to_mb['edit_link'] = lc_to_add_to_mb.apply(
mb_label_edit_lc_link, axis=1)
print('Count:', len(lc_to_add_to_mb))
display_df(lc_to_add_to_mb)
Count: 20
# linked to MB label
links_from_wd_with_bnf = sparql("""
SELECT (?label AS ?wd) ?mbid ?labelLabel ?bnf
WHERE {
?label wdt:P966 ?mbid .
?label wdt:P268 ?bnf .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
ORDER BY ASC(?labelLabel)
""")
links_from_wd_with_bnf.rename(columns={'labelLabel': 'name'}, inplace=True)
print('Count:', len(links_from_wd_with_bnf))
display_df(links_from_wd_with_bnf.head())
Count: 459
wd | mbid | name | bnf | |
---|---|---|---|---|
0 | Q2982799 | 6bac79db-1cda-4c50-b02a-0d275807725d | Q2982799 | 12454777n |
1 | Q3269707 | 7f8729af-19ea-4aef-a951-fee47ff2f59f | Q3269707 | 13889093m |
2 | Q52721802 | 45cd3e9d-7c41-4875-a561-4df7c4e46812 | Q52721802 | 16765226w |
3 | Q77202 | 45fe499a-d335-44a9-9f84-e17ccf97152a | 1C Company | 145534620 |
4 | Q97446731 | ed0d5aff-e23f-4239-950f-bae0bc201e6c | 20th Century Records | 13884815t |
links_from_mb_with_bnf = sql("""
SELECT
url.url AS wd,
label.gid AS mbid,
label.name
FROM label
JOIN l_label_url AS llu ON llu.entity0 = label.id
JOIN url ON llu.entity1 = url.id
WHERE
url.url LIKE '%%bnf.fr%%'
ORDER BY label.name
;
""")
links_from_mb_with_bnf.wd = links_from_mb.wd.apply(lambda s: s.split('/')[-1])
links_from_mb_with_bnf.mbid = links_from_mb.mbid.apply(str)
print('Count:', len(links_from_mb_with_bnf))
display_df(links_from_mb_with_bnf.head())
Count: 102
wd | mbid | name | |
---|---|---|---|
0 | Q315760 | 893364ff-1830-4694-833d-93751f14f984 | 10 Records Ltd. |
1 | Q4544982 | becff808-7fa6-4f85-ab15-0444bca5d04c | 1NC@ |
2 | Q58209721 | 3129e8bf-cd5c-428d-b79a-51f77445a3f9 | 21 Records |
3 | Q4545734 | 7441004d-1165-4c04-b029-7c8d59c5912a | A&M Records |
4 | Q3983384 | 5a8514dd-22d2-4def-a48d-3a8b71a8e24f | ACPE |
report x missing WD link x missing LC / diverging LC missing discogs /div missing viaf / div missing bnf / div
common LC but no direct MB/WD link common discogs viaf bnf
wikipedia link and no wikidata link
import jinja2
template = jinja2.Template("""
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Alignment of MusicBrainz and Wikidata Record Labels</title>
<link href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
</head>
<body style="margin: 20px;">
<h1>Alignment of MusicBrainz and Wikidata Record Labels</h1>
<p>Latest MB database update: </p>
<p>Latest update: </p>
<ol>
<li>
<a href="#lc">Add missing Label Codes</a>
( rows)
</li>
<li>
<a href="#wddup">MusicBrainz entities sharing a Wikidata link</a>
( rows)
</li>
<li>
<a href="#mbdup">Wikidata entities sharing a MusicBrainz link</a>
( rows)
</li>
<li>
<a href="#wd2mb">Add missing Wikidata links to MusicBrainz</a>
( rows)
</li>
<li>
<a href="#mb2wd">Add missing MusicBrainz links to Wikidata</a>
( rows)
</li>
</ol>
<h2 id="lc">Add missing Label Codes</h2>
<h2 id="wddup">MusicBrainz entities sharing a Wikidata link</h2>
<h2 id="mbdup">Wikidata entities sharing a MusicBrainz link</h2>
<h2 id="wd2mb">Add missing Wikidata links to MusicBrainz</h2>
<h2 id="mb2wd">Add missing MusicBrainz links to Wikidata</h2>
</body>
</html>
""")
with open('../docs/wd-recordlabels-report.html', 'w') as f:
f.write(template.render(**globals())
.replace('<', '<').replace('>', '>')
.replace('class="dataframe"', 'class="table table-striped table-hover table-sm"')
.replace('thead', 'thead class="thead-light"'))