Source code for io_collection.keys.get_keys

from pathlib import Path

import boto3


def get_keys(location: str, prefix: str) -> list[str]:
    """
    List the object keys found at a location under the given prefix.

    A location that starts with the **s3://** protocol is treated as an S3
    bucket (the protocol is removed before the bucket is queried). Any other
    location is treated as a path on the local file system.

    Parameters
    ----------
    location
        Object location (local path or S3 bucket).
    prefix
        Object key prefix.

    Returns
    -------
    :
        List of all object keys at location.
    """

    s3_protocol = "s3://"

    # Anything without the S3 protocol is handled as a local path.
    if not location.startswith(s3_protocol):
        return _get_keys_on_fs(location, prefix)

    return _get_keys_on_s3(location[len(s3_protocol) :], prefix)
def _get_keys_on_fs(path: str, prefix: str) -> list[str]: """ Get list of objects on local file system with given prefix. Parameters ---------- path Local object path. prefix Object key prefix. Returns ------- : List of all object keys on the local file system. """ glob_pattern = f"{prefix}/**/*".replace("//", "/") all_files = Path(path).rglob(glob_pattern) regular_files = [str(file) for file in all_files if not file.is_dir()] return [file.replace(path, "").strip("/") for file in regular_files] def _get_keys_on_s3(bucket: str, prefix: str) -> list[str]: """ Get list of objects in AWS S3 bucket with given prefix. Parameters ---------- bucket AWS S3 bucket name. prefix Object key prefix. Returns ------- : List of all object keys in the AWS bucket. """ s3_client = boto3.client("s3") bucket = bucket.replace("s3://", "") prefix = f"{prefix}/" response = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix) get_response = True keys: list[str] = [] while get_response: if "Contents" not in response: break content_keys = [content["Key"] for content in response["Contents"]] keys = keys + content_keys if response["IsTruncated"]: response = s3_client.list_objects_v2( Bucket=bucket, Prefix=prefix, ContinuationToken=response["NextContinuationToken"], ) else: get_response = False return keys