diff --git a/share/harvesters/org_swbiodiversity.py b/share/harvesters/org_swbiodiversity.py index cee3ac97c..e622fb8fe 100644 --- a/share/harvesters/org_swbiodiversity.py +++ b/share/harvesters/org_swbiodiversity.py @@ -61,4 +61,4 @@ def fetch_records(self): record = raw_data.find(id='innertext') - yield identifier, str(record) + yield collection_page.url, str(record) diff --git a/share/transformers/org_swbiodiversity.py b/share/transformers/org_swbiodiversity.py index b16512096..9de531c1c 100644 --- a/share/transformers/org_swbiodiversity.py +++ b/share/transformers/org_swbiodiversity.py @@ -80,6 +80,7 @@ def unwrap_data(self, input_data): description = self.extract_text(start.find_next()) if description: data['description'] = description + if start: body = start.find_all_next(style='margin-top:5px;') body = list(map(self.extract_text, body)) @@ -90,10 +91,26 @@ def unwrap_data(self, input_data): contact_dict = {} contact = entry.replace('Contact:', '').strip() contact_email = contact[contact.find("(") + 1:contact.find(")")] + if re.match(r"[^@\s]+@[^@\s]+\.[^@\s]+", contact_email): + if '/' in contact_email: + contact_email = contact_email.split('/')[0] + if ',' in contact_email: + contact_email = contact_email.split(',')[0] + else: + contact_email = None + contact_name = contact.split('(', 1)[0].strip() - if ', Curator' in contact_name: - contact_name = contact_name.replace(', Curator', '').strip() - if contact and contact_email and re.match(r"[^@]+@[^@]+\.[^@]+", contact_email): + remove_list = ['Curator', 'Science Division Chair', + 'Collections Manager', 'Administrative Director', + 'Director', 'Director and Curator', + 'Mycologist and Director', ','] + for item in remove_list: + insensitive_item = re.compile(re.escape(item), re.IGNORECASE) + contact_name = insensitive_item.sub('', contact_name) + if '/' in contact_name: + contact_name = contact_name.split('/')[0] + + if contact and contact_email: contact_dict['email'] = contact_email if contact_name: contact_dict['name'] = contact_name @@ -125,6 +142,7 @@ def unwrap_data(self, input_data): collection_statistics = start.find_all_next('li') collection_statistics = list(map(self.extract_text, collection_statistics)) data['collection-statistics'] = self.process_collection_stat(collection_statistics) + return data def extract_text(self, text): diff --git a/tests/share/harvesters/test_swbiodiversity_harvester.py b/tests/share/harvesters/test_swbiodiversity_harvester.py index ebea1eef6..d58255e68 100644 --- a/tests/share/harvesters/test_swbiodiversity_harvester.py +++ b/tests/share/harvesters/test_swbiodiversity_harvester.py @@ -113,7 +113,7 @@ def test_swbiodiversity_harvester(): end = pendulum.utcnow() result = harvester._do_fetch(start, end) for data in result: - assert data[0] == '223' + assert data[0] == collection.url assert "".join(data[1].split()) == "".join('''

SEINet - Arizona Chapter Collections