Skip to content

Dataset handler

BladesightDatasetDirectory

This object is used to access datasets from the Bladesight Data bucket on S3.

It also lists the local datasets.

Examples:

Load a dataset into memory:

>>> Datasets = BladesightDatasetDirectory()
>>> dataset = Datasets["data/intro_to_btt/intro_to_btt_ch02"]
>>> df_table = dataset["table/dataset_1"]
Source code in bladesight/dataset_handler.py
class BladesightDatasetDirectory:
    """Access datasets from the Bladesight Data bucket on S3.

    The directory also keeps track of the datasets that are already
    available locally.

    Examples:
    ---------
    Load a dataset into memory:

        >>> Datasets = BladesightDatasetDirectory()
        >>> dataset = Datasets["data/intro_to_btt/intro_to_btt_ch02"]
        >>> df_table = dataset["table/dataset_1"]
    """
    def __init__(self):
        self.path = get_path_to_local_bladesight()
        # _refresh_available_datasets() fills in both self.local_datasets and
        # self.online_datasets, so no separate local scan is needed here.
        self._refresh_available_datasets()

    @staticmethod
    def _getitem_key_correct_format(key: str) -> bool:
        """Check whether a dataset key is in the correct format, i.e. whether
        it starts with "data/" like "data/intro_to_btt/intro_to_btt_ch02".

        Args:
            key (str): The key to check.

        Returns:
            bool: True if the key is in the correct format, False otherwise.

        Examples:
        ---------
        Check if a key is in the correct format:

            >>> BladesightDatasetDirectory._getitem_key_correct_format(
            ... "data/intro_to_btt/intro_to_btt_ch02"
            ... )
            True

            >>> BladesightDatasetDirectory._getitem_key_correct_format(
            ... "intro_to_btt/intro_to_btt_ch02"
            ... )
            False
        """
        return key.startswith("data/")

    def _find_local_dataset(self, key: str):
        """Return the entry of self.local_datasets that matches key, or None.

        Args:
            key (str): The dataset key, in the "data/..." format.

        Returns:
            The matching local dataset name, or None if there is no match.
        """
        for local_dataset in self.local_datasets:
            if self._replace_path_prefix(local_dataset) == key:
                return local_dataset
        return None

    def __getitem__(self, key: str) -> "Dataset":
        """Get the dataset specified by a key. If the dataset is not found
        locally, it is downloaded from the Bladesight Data bucket.

        Args:
            key (str): The name of the dataset.

        Raises:
            KeyError: If the key does not start with "data/", if the dataset
                is neither local nor listed online, or if it is still
                missing locally after the download.

        Returns:
            Dataset: The dataset.

        Examples:
        ---------
        Load a dataset into memory:

            >>> Datasets = BladesightDatasetDirectory()
            >>> dataset = Datasets["data/intro_to_btt/intro_to_btt_ch02"]
        """
        if not self._getitem_key_correct_format(key):
            raise KeyError(
                f"Dataset {key} does not start with data/. The key should be in the format 'data/../../etc'."
            )

        local_dataset = self._find_local_dataset(key)
        if local_dataset is None:
            online_names = {
                self._replace_path_prefix(online_set)
                for online_set in self.online_datasets
            }
            if key not in online_names:
                raise KeyError(f"Dataset {key} not found.")

            download_dataset_from_bladesight_data(
                self._replace_path_prefix(key, BLADESIGHT_DATASETS_S3_BUCKET)
            )
            self.local_datasets = get_local_datasets()
            # Look the dataset up once more instead of recursing through
            # self[key]: if the download did not produce a local file, a
            # recursive call would download and recurse again without end.
            local_dataset = self._find_local_dataset(key)
            if local_dataset is None:
                raise KeyError(
                    f"Dataset {key} was downloaded but could not be found locally."
                )

        return Dataset(self.path / pathlib.Path(local_dataset + ".db"))

    @staticmethod
    def _replace_path_prefix(
        dataset_full_path: str, replace_prefix: str = "data"
    ) -> str:
        """Replace the first component of a "/"-separated path with the
        replace_prefix argument. For example, if the dataset path is
        "bladesight-data/intro_to_btt/intro_to_btt_ch02" and the
        replace_prefix is "data", the path is returned as
        "data/intro_to_btt/intro_to_btt_ch02".

        Args:
            dataset_full_path (str): The full path to the dataset.
            replace_prefix (str, optional): The prefix that replaces the
                first path component. Defaults to "data".

        Returns:
            str: The new path.

        Examples:
        ---------
        Replace the first path prefix with "data":

            >>> BladesightDatasetDirectory._replace_path_prefix(
            ... "bladesight-data/intro_to_btt/intro_to_btt_ch02",
            ... "data"
            ... )
            'data/intro_to_btt/intro_to_btt_ch02'
        """
        new_path = [replace_prefix] + dataset_full_path.split("/")[1:]
        return "/".join(new_path)

    def _ipython_key_completions_(self):
        """Return the online dataset names with their first path component
        replaced by "data", for use as key completions.
        """
        return [self._replace_path_prefix(i) for i in self.online_datasets]

    def _refresh_available_datasets(self):
        """Refresh the local and online datasets.

        If the online datasets cannot be read, only the local datasets
        are listed.
        """
        self.local_datasets = get_local_datasets()
        try:
            self.online_datasets = get_bladesight_datasets()
        except Exception:
            # Deliberate best-effort: any failure to list the remote bucket
            # falls back to the local listing.
            print("Could not read remote datasets. Only listing local datasets")
            self.online_datasets = self.local_datasets

__getitem__(key)

Get the dataset specified by a key. If the dataset is not found, it will be downloaded from the Bladesight Data bucket.

Parameters:

Name Type Description Default
key str

The name of the dataset.

required

Raises:

Type Description
KeyError

If the dataset is not found.

Returns:

Name Type Description
Dataset Dataset

The dataset.

Examples:

Load a dataset into memory:

>>> Datasets = BladesightDatasetDirectory()
>>> dataset = Datasets["data/intro_to_btt/intro_to_btt_ch02"]
Source code in bladesight/dataset_handler.py
def __getitem__(self, key: str) -> "Dataset":
    """Get the dataset specified by a key. If the dataset is not found
    locally, it is downloaded from the Bladesight Data bucket.

    Args:
        key (str): The name of the dataset.

    Raises:
        KeyError: If the key does not start with "data/", if the dataset
            is neither local nor listed online, or if it is still
            missing locally after the download.

    Returns:
        Dataset: The dataset.

    Examples:
    ---------
    Load a dataset into memory:

        >>> Datasets = BladesightDatasetDirectory()
        >>> dataset = Datasets["data/intro_to_btt/intro_to_btt_ch02"]
    """
    if not self._getitem_key_correct_format(key):
        raise KeyError(
            f"Dataset {key} does not start with data/. The key should be in the format 'data/../../etc'."
        )

    def _find_local():
        # Return the entry of self.local_datasets matching key, or None.
        for local_dataset in self.local_datasets:
            if self._replace_path_prefix(local_dataset) == key:
                return local_dataset
        return None

    matching_local = _find_local()
    if matching_local is None:
        online_names = {
            self._replace_path_prefix(online_set)
            for online_set in self.online_datasets
        }
        if key not in online_names:
            raise KeyError(f"Dataset {key} not found.")

        download_dataset_from_bladesight_data(
            self._replace_path_prefix(key, BLADESIGHT_DATASETS_S3_BUCKET)
        )
        self.local_datasets = get_local_datasets()
        # Look the dataset up once more instead of recursing through
        # self[key]: if the download did not produce a local file, a
        # recursive call would download and recurse again without end.
        matching_local = _find_local()
        if matching_local is None:
            raise KeyError(
                f"Dataset {key} was downloaded but could not be found locally."
            )

    return Dataset(self.path / pathlib.Path(matching_local + ".db"))

Dataset

This object is used to access data from a dataset.

Parameters:

Name Type Description Default
path Path

The path to the dataset.

required

Examples:


>>> dataset = Dataset("bladesight-data/intro_to_btt/intro_to_btt_ch02.db")
>>> dataset.tables
['dataset_1', 'dataset_2']
>>> dataset.metadata
{
    "CITATION": {
        "repr": "This is a citation",
        "url": "https://example.com",
        "doi": "10.1234/5678"
    }
}
>>> dataset.set_dataframe_library("pl")
>>> df_table = dataset["table/dataset_1"]
>>> dataset.print_citation()
Source code in bladesight/dataset_handler.py
class Dataset:
    """Handle to a single dataset file, used to read its tables.

    Args:
        path (pathlib.Path): The path to the dataset.

    Examples:
    ---------
        >>> dataset = Dataset("bladesight-data/intro_to_btt/intro_to_btt_ch02.db")
        >>> dataset.tables
        ['dataset_1', 'dataset_2']
        >>> dataset.metadata
        {
            "CITATION": {
                "repr": "This is a citation",
                "url": "https://example.com",
                "doi": "10.1234/5678"
            }
        }
        >>> dataset.set_dataframe_library("pl")
        >>> df_table = dataset["table/dataset_1"]
        >>> dataset.print_citation()
    """
    def __init__(self, path: pathlib.Path):
        # Validate first so nothing is read from an invalid dataset.
        _confirm_dataset_is_valid(path)
        self.path = path
        self.tables: List[str] = _get_db_tables(path)
        self.metadata: Dict[str, Dict] = _get_all_metadata(path)
        # Tables are returned with the "pd" library until changed.
        self.dataframe_library: Literal["pd", "pl"] = "pd"
        # The citation is printed every time a dataset is opened.
        self.print_citation()

    def set_dataframe_library(self, library: Literal["pd", "pl"]):
        """Choose the dataframe library used when tables are returned.

        Args:
            library (Literal['pd', 'pl']): The dataframe library to use.

        Raises:
            ValueError: If the library is not 'pd' or 'pl'.

        Examples:
        ---------
            Set the dataframe library to polars.

            >>> dataset = Dataset("bladesight-data/intro_to_btt/intro_to_btt_ch02.db")
            >>> dataset.set_dataframe_library("pl")
        """
        if library not in ("pd", "pl"):
            raise ValueError("library must be 'pd' or 'pl'")
        self.dataframe_library = library

    def __getitem__(self, key: str) -> Union[pd.DataFrame, pl.DataFrame]:
        """Return one table of the dataset.

        Args:
            key (str): The name of the table, prefixed with "table/".

        Raises:
            KeyError: If the table is not found.

        Returns:
            Union[pd.DataFrame, pl.DataFrame]: The table.

        Examples:
        ---------
            Load a table from the dataset into memory:

            >>> dataset = Dataset("bladesight-data/intro_to_btt/intro_to_btt_ch02.db")
            >>> df_table = dataset["table/dataset_1"]
        """
        requested_table = key.replace("table/", "")
        if requested_table not in self.tables:
            raise KeyError(
                f"Table {requested_table} not found. These are the tables in the dataset: {self.tables}"
            )
        # The name was checked against self.tables before being put in the query.
        query = f"SELECT * FROM {requested_table};"
        return _read_sql(self.path, query, return_mode=self.dataframe_library)

    def print_citation(self):
        """Print the citation built from the dataset's metadata."""
        citation_text = _get_printable_citation(self.metadata)
        print(citation_text)

    def _ipython_key_completions_(self):
        # Each table name is offered with the "table/" prefix.
        completions = []
        for table_name in self.tables:
            completions.append("table/" + table_name)
        return completions

    def __repr__(self) -> str:
        """Describe the dataset: its path followed by its table keys.

        Returns:
            str: The dataset and its tables in a string.
        """
        table_rows = "".join(f"\t'table/{name}',\n " for name in self.tables)
        return f"Dataset({self.path}),\n\n Tables: \n [\n{table_rows}]"

__getitem__(key)

This function returns a table from the dataset.

Parameters:

Name Type Description Default
key str

The name of the table, prefixed with "table/".

required

Raises:

Type Description
KeyError

If the table is not found.

Returns:

Type Description
Union[DataFrame, DataFrame]

Union[pd.DataFrame, pl.DataFrame]: The table.

Examples:


Load a table from the dataset into memory:

>>> dataset = Dataset("bladesight-data/intro_to_btt/intro_to_btt_ch02.db")
>>> df_table = dataset["table/dataset_1"]
Source code in bladesight/dataset_handler.py
def __getitem__(self, key: str) -> Union[pd.DataFrame, pl.DataFrame]:
    """Return one table of the dataset.

    Args:
        key (str): The name of the table, prefixed with "table/".

    Raises:
        KeyError: If the table is not found.

    Returns:
        Union[pd.DataFrame, pl.DataFrame]: The table.

    Examples:
    ---------
        Load a table from the dataset into memory:

        >>> dataset = Dataset("bladesight-data/intro_to_btt/intro_to_btt_ch02.db")
        >>> df_table = dataset["table/dataset_1"]
    """
    requested_table = key.replace("table/", "")
    if requested_table not in self.tables:
        raise KeyError(
            f"Table {requested_table} not found. These are the tables in the dataset: {self.tables}"
        )
    # The name was checked against self.tables before being put in the query.
    query = f"SELECT * FROM {requested_table};"
    return _read_sql(self.path, query, return_mode=self.dataframe_library)

__repr__()

Show the dataset and its tables.

Returns:

Name Type Description
str str

The dataset and its tables in a string.

Source code in bladesight/dataset_handler.py
def __repr__(self) -> str:
    """Describe the dataset: its path followed by its table keys.

    Returns:
        str: The dataset and its tables in a string.
    """
    table_rows = "".join(f"\t'table/{name}',\n " for name in self.tables)
    return f"Dataset({self.path}),\n\n Tables: \n [\n{table_rows}]"

print_citation()

Print the citation provided in the metadata table.

Source code in bladesight/dataset_handler.py
def print_citation(self):
    """Print the citation built from the dataset's metadata."""
    citation_text = _get_printable_citation(self.metadata)
    print(citation_text)

set_dataframe_library(library)

This function sets the dataframe library to use when returning data.

Parameters:

Name Type Description Default
library Literal['pd', 'pl']

The dataframe library to use.

required

Raises:

Type Description
ValueError

If the library is not 'pd' or 'pl'.

Examples:


Set the dataframe library to polars.

>>> dataset = Dataset("bladesight-data/intro_to_btt/intro_to_btt_ch02.db")
>>> dataset.set_dataframe_library("pl")
Source code in bladesight/dataset_handler.py
def set_dataframe_library(self, library: Literal["pd", "pl"]):
    """Choose the dataframe library used when tables are returned.

    Args:
        library (Literal['pd', 'pl']): The dataframe library to use.

    Raises:
        ValueError: If the library is not 'pd' or 'pl'.

    Examples:
    ---------
        Set the dataframe library to polars.

        >>> dataset = Dataset("bladesight-data/intro_to_btt/intro_to_btt_ch02.db")
        >>> dataset.set_dataframe_library("pl")
    """
    # Reject unknown libraries up front; the stored value is left untouched.
    if library not in ("pd", "pl"):
        raise ValueError("library must be 'pd' or 'pl'")
    self.dataframe_library = library

download_dataset_from_bladesight_data(dataset_path_on_s3)

This function downloads a dataset from S3 and saves it locally.

Parameters:

Name Type Description Default
dataset_path_on_s3 str

The path to the dataset on S3.

required

Examples:


Download a dataset into the local .bladesight directory

>>> download_dataset_from_bladesight_data("bladesight-datasets/intro_to_btt/intro_to_btt_ch02")
Source code in bladesight/dataset_handler.py
def download_dataset_from_bladesight_data(dataset_path_on_s3: str) -> None:
    """Download a dataset from S3 and save it locally.

    The first component of dataset_path_on_s3 is dropped and the remaining
    components are recreated under the "data" folder of the local
    Bladesight directory. The ".db" suffix is appended to both the remote
    and the local path.

    Args:
        dataset_path_on_s3 (str): The path to the dataset on S3, without
            the ".db" suffix.

    Examples:
    ---------
        Download a dataset into the local .bladesight directory

        >>> download_dataset_from_bladesight_data("bladesight-datasets/intro_to_btt/intro_to_btt_ch02")
    """
    s3 = s3fs.S3FileSystem(anon=True)
    local_db_path = (get_path_to_local_bladesight() / "data").joinpath(
        *dataset_path_on_s3.split("/")[1:]
    )

    # exist_ok avoids failing when the folder is created between an
    # existence check and the mkdir call.
    local_db_path.parent.mkdir(parents=True, exist_ok=True)

    with yaspin(
        text=f"Downloading {dataset_path_on_s3} from Bladesight Data..."
    ) as spinner:
        s3.download(dataset_path_on_s3 + ".db", str(local_db_path) + ".db")
        # Finalize the spinner while it is still active, i.e. inside the
        # context manager and only after the download has succeeded.
        spinner.text = f"Done downloading {dataset_path_on_s3} from Bladesight Data... "
        spinner.ok("✅ ")

get_bladesight_datasets()

This function returns a list of all the datasets in the bladesight-datasets bucket.

Returns:

Type Description
List[str]

List[str]: A list of the names of the datasets in the bucket.

Source code in bladesight/dataset_handler.py
def get_bladesight_datasets() -> List[str]:
    """List every dataset found in the bladesight-datasets bucket.

    The bucket is walked and each file ending in ".db" is reported as
    "<folder>/<file name>" with the ".db" suffix removed.

    Returns:
        List[str]: A list of the names of the datasets in the bucket.
    """
    remote_fs = s3fs.S3FileSystem(anon=True)
    dataset_names = []
    with yaspin(text="Getting all datasets from Bladesight Data..."):
        for folder, _, filenames in remote_fs.walk(BLADESIGHT_DATASETS_S3_BUCKET + "/"):
            dataset_names.extend(
                f"{folder}/{filename}"[:-3]
                for filename in filenames
                if filename.endswith(".db")
            )
    return dataset_names

get_local_datasets()

This function returns a list of the names of the datasets in the local datasets folder.

Returns:

Type Description
List[str]

List[str]: A list of the dataset names in the local datasets folder.

Source code in bladesight/dataset_handler.py
def get_local_datasets(
    datasets_root: Union[pathlib.Path, str, None] = None
) -> List[str]:
    """Return the names of the datasets in the local datasets folder.

    Every ".db" file found below the datasets folder is reported as its
    path relative to that folder, with "/" separators and without the
    ".db" suffix (e.g. "data/intro_to_btt/intro_to_btt_ch02").

    Args:
        datasets_root (Union[pathlib.Path, str, None], optional): The folder
            to scan. Defaults to None, in which case the folder returned by
            get_path_to_local_bladesight() is used.

    Returns:
        List[str]: A list of the dataset names in the local datasets
            folder. Empty if the folder does not exist.
    """
    if datasets_root is None:
        datasets_root = get_path_to_local_bladesight()
    root = pathlib.Path(datasets_root)
    if not root.exists():
        return []

    local_datasets = []
    for path_root, _, files in os.walk(root):
        # Compute the prefix relative to the scanned folder itself rather
        # than searching the path for a part named ".bladesight": this also
        # handles files directly in the root and paths that contain
        # ".bladesight" more than once.
        relative_folder = pathlib.Path(path_root).relative_to(root)
        for file in files:
            if file.endswith(".db"):
                local_datasets.append((relative_folder / file).as_posix()[:-3])
    return local_datasets

get_path_to_local_bladesight()

This function returns the path to the local datasets folder. If there is no environment variable called BLADESIGHT_DATASETS_PATH, it will return ~/.bladesight.

Returns:

Type Description
Path

pathlib.Path: The path to the local datasets folder. It does not necessarily exist.

Source code in bladesight/dataset_handler.py
def get_path_to_local_bladesight() -> pathlib.Path:
    """Return the path to the local datasets folder.

    The folder is the ".bladesight" directory inside the directory named
    by the BLADESIGHT_DATASETS_PATH environment variable. If that variable
    is not set, it is ~/.bladesight.

    Returns:
        pathlib.Path: The path to the local datasets folder. It does
            not necessarily exist.
    """
    configured_base = os.environ.get("BLADESIGHT_DATASETS_PATH")
    if configured_base is None:
        base_dir = pathlib.Path.home()
    else:
        base_dir = pathlib.Path(configured_base)
    return base_dir / ".bladesight"