Source code for renku.core.dataset.providers.doi

# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""DOI API integration."""

import urllib
from pathlib import Path

from renku.core import errors
from renku.core.dataset.providers.api import ImporterApi, ImportProviderInterface, ProviderApi, ProviderPriority
from renku.core.util.doi import extract_doi, is_doi

DOI_BASE_URL = "https://dx.doi.org"


class DOIProvider(ProviderApi, ImportProviderInterface):
    """`doi.org <http://doi.org>`_ registry API provider."""

    priority = ProviderPriority.HIGHER
    name = "DOI"
    is_remote = True

    def __init__(self, uri: str, headers=None, timeout=3):
        super().__init__(uri=uri)

        self.timeout = timeout
        self.headers = headers if headers is not None else {"accept": "application/vnd.citationstyles.csl+json"}

    @staticmethod
    def supports(uri) -> bool:
        """Whether or not this provider supports a given URI."""
        return bool(is_doi(uri))

    def get_importer(self, **kwargs) -> "DOIImporter":
        """Get import manager."""
        from renku.core.util import requests

        def query(doi):
            """Retrieve metadata for given doi."""
            doi = extract_doi(doi)
            url = make_doi_url(doi)

            response = requests.get(url, headers=self.headers)
            if response.status_code != 200:
                raise LookupError(f"record not found. Status: {response.status_code}")

            return response

        def serialize(response):
            """Serialize HTTP response for DOI."""
            json_data = response.json()
            data = {key.replace("-", "_").lower(): value for key, value in json_data.items()}
            try:
                return DOIImporter(**data)
            except TypeError:
                raise errors.DatasetImportError("doi metadata could not be serialized")

        query_response = query(self.uri)
        return serialize(query_response)
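
# Illustrative usage sketch (not part of the module). The DOI below is
# hypothetical; ``get_importer`` performs a live request against doi.org and
# raises ``LookupError`` if the record cannot be resolved.
def _example_fetch_doi_metadata():  # hypothetical helper, for illustration only
    uri = "10.5281/zenodo.3363060"  # hypothetical DOI
    if DOIProvider.supports(uri):
        importer = DOIProvider(uri=uri).get_importer()
        print(importer.name, importer.doi, importer.version)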

class DOIImporter(ImporterApi):
    """Response from `doi.org <http://doi.org>`_ for DOI metadata."""

    def __init__(
        self,
        id,
        doi,
        url,
        abstract=None,
        author=None,
        categories=None,
        container_title=None,
        contributor=None,
        copyright=None,
        issued=None,
        language=None,
        publisher=None,
        title=None,
        type=None,
        version=None,
    ):
        super().__init__(uri=url, original_uri=url)

        self.id = id
        self.doi = doi
        self.abstract = abstract
        self.author = author
        self.categories = categories
        self.container_title = container_title
        self.contributor = contributor
        self.copyright = copyright
        self.issued = issued
        self.language = language
        self.publisher = publisher
        self.name = title
        self.type = type
        self._version = version

    @property
    def version(self) -> str:
        """Get record version."""
        return self._version

    @property
    def latest_uri(self) -> str:
        """Get URI of the latest version."""
        return self.uri

    def fetch_provider_dataset(self):
        """Deserialize this record to a ``ProviderDataset``."""
        raise NotImplementedError

    def is_latest_version(self) -> bool:
        """Check if record is at last possible version."""
        return True

    def download_files(self, destination: Path, extract: bool):
        """Download dataset files from the remote provider."""
        raise NotImplementedError

    def tag_dataset(self, name: str) -> None:
        """Create a tag for the dataset ``name`` if the remote dataset has a tag/version."""
        raise NotImplementedError

    def copy_extra_metadata(self, new_dataset) -> None:
        """Copy provider specific metadata once the dataset is created."""
        raise NotImplementedError
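
# Illustrative sketch (not part of the module): how ``DOIProvider.get_importer``
# maps a CSL-JSON payload onto ``DOIImporter``. Hyphenated keys are normalized
# to snake_case keyword arguments; the payload below is made up for illustration.
def _example_importer_from_csl_json():  # hypothetical helper, for illustration only
    csl_json = {
        "id": "https://doi.org/10.1234/example",  # hypothetical record
        "DOI": "10.1234/example",
        "URL": "https://example.org/record/1",
        "container-title": "Example Repository",
        "title": "An example dataset",
        "version": "1.0",
    }
    data = {key.replace("-", "_").lower(): value for key, value in csl_json.items()}
    importer = DOIImporter(**data)
    print(importer.name, importer.version, importer.latest_uri)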

def make_doi_url(doi):
    """Create URL to access DOI metadata."""
    parsed_url = urllib.parse.urlparse(doi)
    if parsed_url.scheme == "doi":
        parsed_url = parsed_url._replace(scheme="")
        doi = parsed_url.geturl()

    return urllib.parse.urljoin(DOI_BASE_URL, doi)
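
# Illustrative sketch (not part of the module): ``make_doi_url`` strips an
# optional ``doi:`` scheme before joining the identifier onto ``DOI_BASE_URL``,
# so plain and ``doi:``-prefixed identifiers resolve to the same metadata URL.
# The DOI below is hypothetical.
def _example_make_doi_url():  # hypothetical helper, for illustration only
    plain = make_doi_url("10.5281/zenodo.3363060")
    prefixed = make_doi_url("doi:10.5281/zenodo.3363060")
    assert plain == prefixed == "https://dx.doi.org/10.5281/zenodo.3363060"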