od2_md_scripts/vocabularies.py at main · OregonDigital/od2_md_scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
"""URI validation functions for controlled vocabularies."""
import re
import logging

logger = logging.getLogger(__name__)

def validate_lcnaf(value: str) -> bool:
    """Return True if *value* is a well-formed LCNAF (LC Name Authority File) URI.

    Examples:
    solr query: "http://id.loc.gov/authorities/names/no2013038294"
    solr query: "http://id.loc.gov/authorities/names/nr99003467"
    solr query: "http://id.loc.gov/authorities/names/nb2005019894"
    od2 map: "http://id.loc.gov/authorities/names/n93112029"

    The whole string must be the exact prefix from http through names/,
    then one of n, no, nr or nb, then 8 to 10 ASCII digits.

    Args:
        value: Candidate URI string.

    Returns:
        True when the entire string matches the pattern, otherwise False.
    """
    # fullmatch (rather than match + '$') rejects a trailing newline, and
    # [0-9] (rather than \d) rejects non-ASCII Unicode digits.
    pattern = r'http://id\.loc\.gov/authorities/names/(?:n|no|nr|nb)[0-9]{8,10}'
    return re.fullmatch(pattern, value) is not None

def validate_ulan(value: str) -> bool:
    """Return True if *value* is a well-formed Getty ULAN URI.

    Examples:
    solr query: "http://vocab.getty.edu/ulan/500012467"
    solr query: "http://vocab.getty.edu/ulan/500006931"
    solr query: "http://vocab.getty.edu/ulan/500330183"
    od2 map: "http://vocab.getty.edu/ulan/500009666"

    The whole string must be the exact prefix through ulan/, then 500,
    then 6 ASCII digits.

    Args:
        value: Candidate URI string.

    Returns:
        True when the entire string matches the pattern, otherwise False.
    """
    # The dots in the host name are escaped so they only match a literal '.';
    # fullmatch rejects a trailing newline that match + '$' would accept.
    pattern = r'http://vocab\.getty\.edu/ulan/500[0-9]{6}'
    return re.fullmatch(pattern, value) is not None

def validate_creator(value: str) -> bool:
    """Return True if *value* is a well-formed opaquenamespace creator URI.

    Examples:
    solr query: "http://opaquenamespace.org/ns/creator/HoldenDorbe"
    solr query: "http://opaquenamespace.org/ns/creator/Nexus"
    solr query: "http://opaquenamespace.org/ns/creator/OlsenandJohnson"
    solr query: "http://opaquenamespace.org/ns/creator/BennesJohnV"
    solr query: "http://opaquenamespace.org/ns/creator/OpsisArchitecturearchitecturalfirm"
    od2 map: "http://opaquenamespace.org/ns/creator/HaynesCharles"

    The whole string must be the exact prefix through creator/, followed by
    one or more ASCII letters (no digits or punctuation).

    Args:
        value: Candidate URI string.

    Returns:
        True when the entire string matches the pattern, otherwise False.
    """
    # The dot in the host name is escaped so it only matches a literal '.';
    # fullmatch rejects a trailing newline that match + '$' would accept.
    pattern = r'http://opaquenamespace\.org/ns/creator/[a-zA-Z]+'
    return re.fullmatch(pattern, value) is not None

def validate_people(value: str) -> bool:
    """Return True if *value* is a well-formed opaquenamespace people URI.

    Examples:
    solr query: "http://opaquenamespace.org/ns/people/SkinnerJamesEdward18671959"
    solr query: "http://opaquenamespace.org/ns/people/SkinnerSusanLawrence18711952"
    solr query: "http://opaquenamespace.org/ns/people/SteinkeClaytonEJr"
    od2 map: "http://opaquenamespace.org/ns/people/GrayKen"

    The whole string must be the exact prefix through people/, followed by
    one or more ASCII letters or digits.

    Args:
        value: Candidate URI string.

    Returns:
        True when the entire string matches the pattern, otherwise False.
    """
    # The dot in the host name is escaped so it only matches a literal '.';
    # fullmatch rejects a trailing newline, and 0-9 rejects non-ASCII digits.
    pattern = r'http://opaquenamespace\.org/ns/people/[a-zA-Z0-9]+'
    return re.fullmatch(pattern, value) is not None

def validate_wikidata(value: str) -> bool:
    """Return True if *value* is a well-formed Wikidata entity URI.

    Examples:
    from website: "http://www.wikidata.org/entity/Q193020"
    od2 map: "http://www.wikidata.org/entity/Q6134558"

    The whole string must be the exact prefix through entity/, then Q,
    then one or more ASCII digits.

    Args:
        value: Candidate URI string.

    Returns:
        True when the entire string matches the pattern, otherwise False.
    """
    # fullmatch (rather than match + '$') rejects a trailing newline, and
    # [0-9] (rather than \d) rejects non-ASCII Unicode digits.
    pattern = r'http://www\.wikidata\.org/entity/Q[0-9]+'
    # More general pattern based on Bulkrax CSV Guidance Document, hot swap this if above pattern breaks
    # pattern = r'http://www\.wikidata\.org/entity/\S+'
    return re.fullmatch(pattern, value) is not None

def validate_osuacademicunits(value: str) -> bool:
    """Return True if *value* is a well-formed osuAcademicUnits URI.

    Examples:
    from website: "http://opaquenamespace.org/ns/osuAcademicUnits/5eh7OKFX"
    from website: "http://opaquenamespace.org/ns/osuAcademicUnits/Sp8GIq9b"
    from website: "http://opaquenamespace.org/ns/osuAcademicUnits/LaningEnosRolandJr"
    od2 map: "http://opaquenamespace.org/ns/osuAcademicUnits/smuGLIjL"

    The whole string must be the exact prefix through osuAcademicUnits/,
    followed by one or more ASCII letters or digits.

    Args:
        value: Candidate URI string.

    Returns:
        True when the entire string matches the pattern, otherwise False.
    """
    # fullmatch (rather than match + '$') rejects a trailing newline, and
    # 0-9 (rather than \d) rejects non-ASCII Unicode digits.
    pattern = r'http://opaquenamespace\.org/ns/osuAcademicUnits/[a-zA-Z0-9]+'
    return re.fullmatch(pattern, value) is not None

def validate_lcsh(value: str) -> bool:
    """Return True if *value* is a well-formed LCSH subject URI.

    Examples:
    from website: "http://id.loc.gov/authorities/subjects/sh85055245"
    from website: "http://id.loc.gov/authorities/subjects/sh85104841"
    od2 map: "http://id.loc.gov/authorities/subjects/sh85105182"
    TODO slightly suspicious that these all have the same starting 'sh' given that lcnaf didn't, and there's not very many examples in the solr select. Check more extensively

    The whole string must be the exact prefix through subjects/sh, then
    exactly 8 ASCII digits.

    Args:
        value: Candidate URI string.

    Returns:
        True when the entire string matches the pattern, otherwise False.
    """
    # NOTE(review): identifiers with a longer numeric part (e.g. a four-digit
    # year prefix) may exist and would be rejected here -- confirm against
    # id.loc.gov before widening the digit count.
    # fullmatch (rather than match + '$') rejects a trailing newline, and
    # [0-9] (rather than \d) rejects non-ASCII Unicode digits.
    pattern = r'http://id\.loc\.gov/authorities/subjects/sh[0-9]{8}'
    return re.fullmatch(pattern, value) is not None

def validate_tgm(value: str) -> bool:
    """Return True if *value* is a well-formed TGM (graphicMaterials) URI.

    Examples:
    from website: "http://id.loc.gov/vocabulary/graphicMaterials/tgm003035"
    from website: "http://id.loc.gov/vocabulary/graphicMaterials/tgm009453"
    from website: "http://id.loc.gov/vocabulary/graphicMaterials/tgm003961"
    od2 map: "http://id.loc.gov/vocabulary/graphicMaterials/tgm007711"

    The whole string must be the exact prefix through graphicMaterials/tgm,
    then exactly 6 ASCII digits.

    Args:
        value: Candidate URI string.

    Returns:
        True when the entire string matches the pattern, otherwise False.
    """
    # fullmatch (rather than match + '$') rejects a trailing newline, and
    # [0-9] (rather than \d) rejects non-ASCII Unicode digits.
    pattern = r'http://id\.loc\.gov/vocabulary/graphicMaterials/tgm[0-9]{6}'
    return re.fullmatch(pattern, value) is not None

def validate_aat(value: str) -> bool:
    """Return True if *value* is a well-formed Getty AAT URI.

    Examples:
    website: "http://vocab.getty.edu/aat/300011213"
    website: "http://vocab.getty.edu/aat/300185650"
    od2 map: "http://vocab.getty.edu/aat/300134977"

    The whole string must be the exact prefix through aat/300, then
    exactly 6 ASCII digits.

    Args:
        value: Candidate URI string.

    Returns:
        True when the entire string matches the pattern, otherwise False.
    """
    # fullmatch (rather than match + '$') rejects a trailing newline, and
    # [0-9] (rather than \d) rejects non-ASCII Unicode digits.
    pattern = r'http://vocab\.getty\.edu/aat/300[0-9]{6}'
    return re.fullmatch(pattern, value) is not None

def validate_subject(value: str) -> bool:
    """Return True if *value* is a well-formed opaquenamespace subject URI.

    Examples:
    website: "http://opaquenamespace.org/ns/subject/BiddleMorelandAlice"
    website: "http://opaquenamespace.org/ns/subject/AutzenStadiumEugeneOr"
    od2 map: "http://opaquenamespace.org/ns/subject/Glasswork"

    The whole string must be the exact prefix through subject/, followed by
    one or more ASCII letters.
    TODO check if numbers are ok and if this is never empty

    Args:
        value: Candidate URI string.

    Returns:
        True when the entire string matches the pattern, otherwise False.
    """
    # fullmatch (rather than match + '$') rejects a trailing newline.
    pattern = r'http://opaquenamespace\.org/ns/subject/[a-zA-Z]+'
    return re.fullmatch(pattern, value) is not None

def validate_lcorgs(value: str) -> bool:
    """Return True if *value* is a well-formed LC organizations URI.

    Examples:
    website: "http://id.loc.gov/vocabulary/organizations/orul"
    od2 map: "http://id.loc.gov/vocabulary/organizations/orumu"

    The whole string must be the exact prefix through organizations/oru,
    followed by one or more ASCII letters.
    TODO find more examples

    Args:
        value: Candidate URI string.

    Returns:
        True when the entire string matches the pattern, otherwise False.
    """
    # fullmatch (rather than match + '$') rejects a trailing newline.
    pattern = r'http://id\.loc\.gov/vocabulary/organizations/oru[a-zA-Z]+'
    return re.fullmatch(pattern, value) is not None

def validate_itis(value: str) -> bool:
    """Return True if *value* is a well-formed ITIS report URI.

    Examples:
    website: "https://www.itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=82696"
    website: "https://www.itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=563984"
    website: "https://www.itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=175861"
    od2 map: "https://www.itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=99208"

    The whole string must be the exact prefix through search_value=,
    followed by one or more ASCII digits.

    Args:
        value: Candidate URI string.

    Returns:
        True when the entire string matches the pattern, otherwise False.
    """
    # fullmatch (rather than match + '$') rejects a trailing newline, and
    # [0-9] (rather than \d) rejects non-ASCII Unicode digits.
    pattern = (
        r'https://www\.itis\.gov/servlet/SingleRpt/SingleRpt'
        r'\?search_topic=TSN&search_value=[0-9]+'
    )
    return re.fullmatch(pattern, value) is not None

# Deprecated vocabulary
# def validate_ubio(value: str) ->  bool:
#     """Validate ubio URI format

#     Examples:
#     website: ""
#     website: ""
#     od2 map: "http://ubio.org/authority/metadata.php?lsid=urn:lsid:ubio.org:namebank:2633146"

#     TODO there's no data on this and site seems to be down. Should we remove?
#     Match start through ...
#     """
#     pattern = r''
#     return bool(re.match(pattern, value))

def validate_osubuildings(value: str) -> bool:
    """Return True if *value* is a well-formed osuBuildings URI.

    Examples:
    website: "http://opaquenamespace.org/ns/osuBuildings/WomensCenter"
    website: "http://opaquenamespace.org/ns/osuBuildings/GeorgeWPeavyForestScienceCenter"
    od2 map: "http://opaquenamespace.org/ns/osuBuildings/NashHall"

    The whole string must be the exact prefix through osuBuildings/,
    followed by one or more ASCII letters.

    Args:
        value: Candidate URI string.

    Returns:
        True when the entire string matches the pattern, otherwise False.
    """
    # fullmatch (rather than match + '$') rejects a trailing newline.
    pattern = r'http://opaquenamespace\.org/ns/osuBuildings/[a-zA-Z]+'
    return re.fullmatch(pattern, value) is not None

def validate_lcgenreforms(value: str) -> bool:
    """Return True if *value* is a well-formed LC genreForms URI.

    Examples:
    od2 map: "http://id.loc.gov/authorities/genreForms/gf2018026004"

    TODO find more examples
    The whole string must be the exact prefix through genreForms/gf,
    followed by one or more ASCII digits.

    Args:
        value: Candidate URI string.

    Returns:
        True when the entire string matches the pattern, otherwise False.
    """
    # fullmatch (rather than match + '$') rejects a trailing newline, and
    # [0-9] (rather than \d) rejects non-ASCII Unicode digits.
    pattern = r'http://id\.loc\.gov/authorities/genreForms/gf[0-9]+'
    return re.fullmatch(pattern, value) is not None

def validate_bne(value: str) -> bool:
    """Return True if *value* is a well-formed BNE (datos.bne.es) URI.

    Examples:
    website: "https://datos.bne.es/resource/XX5094894"
    website: "https://datos.bne.es/resource/XX576905"

    The whole string must be the exact prefix through resource/XX,
    followed by one or more ASCII digits.

    Args:
        value: Candidate URI string.

    Returns:
        True when the entire string matches the pattern, otherwise False.
    """
    # fullmatch (rather than match + '$') rejects a trailing newline, and
    # [0-9] (rather than \d) rejects non-ASCII Unicode digits.
    pattern = r'https://datos\.bne\.es/resource/XX[0-9]+'
    # More general pattern based on Bulkrax CSV Guidance Document, hot swap this if above pattern breaks
    # pattern = r'https://datos\.bne\.es/resource/\S+'
    return re.fullmatch(pattern, value) is not None

def validate_homosaurus(value: str) -> bool:
    """Return True if *value* is a well-formed Homosaurus URI.

    Examples:
    website: "https://homosaurus.org/v4/homoit0001652"
    website: "https://homosaurus.org/v4/homoit0001009"
    website: "https://homosaurus.org/v4/homoit0002075"
    website: "https://homosaurus.org/v3/homoit0001218"

    The whole string must be the exact prefix through .org/, then either
    v3 or v4, then /homoit, then exactly 7 ASCII digits.

    Args:
        value: Candidate URI string.

    Returns:
        True when the entire string matches the pattern, otherwise False.
    """
    # fullmatch (rather than match + '$') rejects a trailing newline, and
    # [0-9] (rather than \d) rejects non-ASCII Unicode digits.
    pattern = r'https://homosaurus\.org/(?:v3|v4)/homoit[0-9]{7}'
    return re.fullmatch(pattern, value) is not None


# Lookup table from vocabulary key to its validator function.
# Re-enabling a commented-out validator takes three changes: the entry in this
# table, the function definition itself, and config/validation_mappings.yaml.
VOCABULARY_VALIDATORS = {
    "lcnaf": validate_lcnaf,
    "ulan": validate_ulan,
    "creator": validate_creator,
    "people": validate_people,
    "wikidata": validate_wikidata,
    "osuacademicunits": validate_osuacademicunits,
    "lcsh": validate_lcsh,
    "tgm": validate_tgm,
    "aat": validate_aat,
    "subject": validate_subject,
    "lcorgs": validate_lcorgs,
    "itis": validate_itis,
    # "ubio": validate_ubio,  # deprecated vocabulary, validator is disabled
    "osubuildings": validate_osubuildings,
    "lcgenreforms": validate_lcgenreforms,
    "bne": validate_bne,
    "homosaurus": validate_homosaurus,
    # Add further vocabularies here
}