229 lines
5.4 KiB
Python
229 lines
5.4 KiB
Python
"""Dataset paths, identifiers, and filenames

Note: this module is not part of Rasterio's API. It is for internal use
only.

"""
|
|
|
|
import os
|
|
import pathlib
|
|
import re
|
|
import sys
|
|
from urllib.parse import urlparse
|
|
|
|
import attr
|
|
|
|
from rasterio.errors import PathError
|
|
|
|
# Supported URI schemes and their mapping to GDAL's VSI suffix.
# TODO: extend for other cloud platforms.
SCHEMES = {
    'ftp': 'curl',
    'gzip': 'gzip',
    'http': 'curl',
    'https': 'curl',
    's3': 's3',
    'tar': 'tar',
    'zip': 'zip',
    'file': 'file',
    'oss': 'oss',
    'gs': 'gs',
    'az': 'az',
}

# Schemes that address a member inside an archive file. This used to be
# bound to the builtin ``set`` type itself, which made membership tests
# against it raise TypeError; it is now an actual set of scheme names.
ARCHIVESCHEMES = {k for k, v in SCHEMES.items() if v in ("gzip", "tar", "zip")}

# Schemes that map to the "curl" VSI suffix.
CURLSCHEMES = {k for k, v in SCHEMES.items() if v == "curl"}

# Schemes treated as remote (network) locations.
# TODO: extend for other cloud platforms.
REMOTESCHEMES = {
    k
    for k, v in SCHEMES.items()
    if v in ("curl", "s3", "oss", "gs", "az")
}
|
|
|
|
|
|
class _Path:
    """Common base class for the dataset path types in this module"""

    def as_vsi(self):
        """Return this path converted by ``_vsi_path``"""
        vsi_name = _vsi_path(self)
        return vsi_name
|
|
|
|
|
|
@attr.s(slots=True)
class _ParsedPath(_Path):
    """Result of parsing a dataset URI/Path

    Attributes
    ----------
    path : str
        Parsed path. Includes the hostname and query string in the case
        of a URI.
    archive : str or None
        Parsed archive path, or None when no archive is involved.
    scheme : str or None
        URI scheme such as "https" or "zip+s3", or None for a plain
        path.

    """

    path = attr.ib()
    archive = attr.ib()
    scheme = attr.ib()

    @classmethod
    def from_uri(cls, uri):
        """Construct a parsed path from a URI string

        Parameters
        ----------
        uri : str
            The URI to split into path, archive, and scheme.

        Returns
        -------
        An instance of ``cls``.

        Notes
        -----
        Only the scheme, netloc, path, and query components reported by
        ``urlparse`` are used. A URI fragment is not carried over into
        the result.

        """
        parts = urlparse(uri)

        # On Windows a netloc that looks like a drive letter ("C:") is
        # really the start of a local path, so fold it back into the path
        # and treat the URI as having no network location.
        if sys.platform == "win32" and re.match(r"^[a-zA-Z]\:", parts.netloc):
            path = f"{parts.netloc}{parts.path}"
            netloc = None
        else:
            path = parts.path
            netloc = parts.netloc

        scheme = parts.scheme or None

        if parts.query:
            path += "?" + parts.query

        archive = None
        if scheme and scheme.startswith(("gzip", "tar", "zip")):
            # Archive URIs look like "archive!member". str.split() always
            # yields at least one item, so the member path is always
            # present; the archive part only exists when a "!" was found.
            path_parts = path.split("!")
            path = path_parts.pop()
            archive = path_parts.pop() if path_parts else None

        # The network location belongs in front of whichever component
        # comes first in the original URI: the archive if there is one,
        # otherwise the path.
        if scheme and netloc:
            if archive:
                archive = netloc + archive
            else:
                path = netloc + path

        # Use cls so that subclasses get instances of their own type.
        return cls(path, archive, scheme)

    @property
    def name(self):
        """The parsed path's original URI"""
        if not self.scheme:
            return self.path
        if self.archive:
            return f"{self.scheme}://{self.archive}!{self.path}"
        return f"{self.scheme}://{self.path}"

    @property
    def is_remote(self):
        """Test if the path is a remote, network URI"""
        # For compound schemes such as "zip+s3" only the last component
        # is compared against REMOTESCHEMES.
        return bool(self.scheme) and self.scheme.split("+")[-1] in REMOTESCHEMES

    @property
    def is_local(self):
        """Test if the path is a local URI"""
        return not self.is_remote
|
|
|
|
|
|
@attr.s(slots=True)
class _UnparsedPath(_Path):
    """Holder for a legacy GDAL filename that is stored as given

    Attributes
    ----------
    path : str
        The legacy GDAL filename.

    """

    path = attr.ib()

    @property
    def name(self):
        """The stored filename, returned unchanged"""
        original = self.path
        return original
|
|
|
|
|
|
def _parse_path(path):
    """Turn a dataset identifier or path into a path object

    Parameters
    ----------
    path : str or path-like object
        The path to be parsed. Objects that are already instances of
        this module's path classes are returned as they are.

    Returns
    -------
    ParsedPath or UnparsedPath

    Raises
    ------
    PathError
        If path is not a path object, a pathlib.PurePath, or a str.

    Notes
    -----
    When legacy GDAL filenames are encountered, they will be returned
    in a UnparsedPath.

    """
    if isinstance(path, _Path):
        return path

    if isinstance(path, pathlib.PurePath):
        return _ParsedPath(os.fspath(path), None, None)

    if not isinstance(path, str):
        raise PathError(f"invalid path '{path!r}'")

    # A Windows drive-letter path is returned as a plain parsed path
    # without going through urlparse.
    if sys.platform == "win32" and re.match(r"^[a-zA-Z]\:", path):
        return _ParsedPath(path, None, None)

    if path.startswith('/vsi'):
        return _UnparsedPath(path)

    # Only URIs whose scheme is made up entirely of Rasterio's supported
    # schemes are parsed; everything else is kept as an UnparsedPath.
    uri_scheme = urlparse(path).scheme
    if uri_scheme and all(part in SCHEMES for part in uri_scheme.split('+')):
        return _ParsedPath.from_uri(path)

    return _UnparsedPath(path)
|
|
|
|
|
|
def _vsi_path(path):
    """Build the GDAL VSI path string for a path object

    Parameters
    ----------
    path : Path
        A ParsedPath or UnparsedPath object.

    Returns
    -------
    str

    Raises
    ------
    ValueError
        If path is neither a ParsedPath nor an UnparsedPath.

    """
    # Unparsed (legacy) filenames are handed back untouched.
    if isinstance(path, _UnparsedPath):
        return path.path

    if not isinstance(path, _ParsedPath):
        raise ValueError("path must be a ParsedPath or UnparsedPath object")

    # No scheme: an ordinary path that needs no VSI decoration.
    if not path.scheme:
        return path.path

    scheme_parts = path.scheme.split("+")
    last_scheme = scheme_parts[-1]

    # Schemes mapped to curl keep their "scheme://" marker in front of
    # the path; the other schemes do not.
    url_head = f"{last_scheme}://" if last_scheme in CURLSCHEMES else ""

    # One "vsi<suffix>" segment per scheme component; "file" adds none.
    handlers = [f"vsi{SCHEMES[p]}" for p in scheme_parts if p != "file"]
    prefix = "/".join(handlers)

    if not prefix:
        return path.path

    if path.archive:
        member = path.path.lstrip("/")
        return f"/{prefix}/{url_head}{path.archive}/{member}"

    return f"/{prefix}/{url_head}{path.path}"
|