ckanapi_harvesters.harvesters package

Subpackages

Submodules

ckanapi_harvesters.harvesters.harvester_abc module

Harvester base class

class ckanapi_harvesters.harvesters.harvester_abc.DatabaseHarvesterABC(params: DatabaseParams = None)

Bases: HarvesterConnectABC, ABC

clear_secrets()
abstractmethod copy(*, dest=None)
abstractmethod get_dataset_harvester(dataset_name: str) DatasetHarvesterABC
abstractmethod get_description() str
abstractmethod get_login_url_without_auth() str
abstractmethod static init_from_options_string(options_string: str, *, base_dir: str = None) Tuple[DatabaseHarvesterABC, List[str]]
abstractmethod list_datasets(return_metadata: bool = True) List[str] | OrderedDict[str, DatasetMetadata]
update_from_ckan(ckan)
class ckanapi_harvesters.harvesters.harvester_abc.DatasetHarvesterABC(params: DatasetParams = None)

Bases: DatabaseHarvesterABC, ABC

clean_dataset_metadata() DatasetMetadata
abstractmethod copy(*, dest=None)
abstractmethod get_table_harvester(table_name: str) TableHarvesterABC
abstractmethod static init_from_options_string(options_string: str, *, base_dir: str = None) Tuple[DatasetHarvesterABC, List[str]]
abstractmethod list_tables(return_metadata: bool = True) List[str] | OrderedDict[str, TableMetadata]
abstractmethod query_dataset_metadata(cancel_if_present: bool = True) DatasetMetadata
class ckanapi_harvesters.harvesters.harvester_abc.HarvesterConnectABC

Bases: ABC

abstractmethod check_connection(*, new_connection: bool = False, raise_error: bool = False) None | ContextErrorLevelMessage
abstractmethod clear_secrets()
abstractmethod connect(*, cancel_if_connected: bool = True) Any
abstractmethod disconnect() None
abstractmethod is_connected() bool
abstractmethod update_from_ckan(ckan)
class ckanapi_harvesters.harvesters.harvester_abc.TableHarvesterABC(params: TableParams = None)

Bases: DatasetHarvesterABC, ABC

clean_table_metadata() TableMetadata
abstractmethod copy(*, dest=None)
get_default_data_cleaner() CkanDataCleanerABC | None
classmethod get_default_df_upload_fun() Callable[[Any], DataFrame] | None
abstractmethod get_default_primary_key() List[str]
abstractmethod static init_from_options_string(options_string: str, *, base_dir: str = None, file_url_attr: str = None) Tuple[TableHarvesterABC, List[str]]
abstractmethod list_queries(*, new_connection: bool = False) List[Tuple[Any, int]]
abstractmethod query_data(query: Any) List[dict] | DataFrame
abstractmethod query_table_metadata(cancel_if_present: bool = True) TableMetadata

ckanapi_harvesters.harvesters.harvester_errors module

Errors specific to harvesting data

exception ckanapi_harvesters.harvesters.harvester_errors.HarvestMethodRequiredError

Bases: Exception

exception ckanapi_harvesters.harvesters.harvester_errors.HarvesterArgumentError

Bases: Exception

exception ckanapi_harvesters.harvesters.harvester_errors.HarvesterArgumentRequiredError(argument: str, harvest_method: str, help: str = None)

Bases: HarvesterArgumentError

exception ckanapi_harvesters.harvesters.harvester_errors.HarvesterRequirementError(requirement: str, harvest_method: str)

Bases: RequirementError

exception ckanapi_harvesters.harvesters.harvester_errors.ResourceNotFoundError(resource_type: str, table_name: str, host: str)

Bases: Exception

ckanapi_harvesters.harvesters.harvester_init module

Harvester initialization from the options_string arguments

ckanapi_harvesters.harvesters.harvester_init.init_dataset_harvester_from_options_string(options_string: str, *, base_dir: str = None) Tuple[DatasetHarvesterABC, List[str]]
ckanapi_harvesters.harvesters.harvester_init.init_table_harvester_from_options_string(options_string: str, *, file_url_attr: str, base_dir: str = None) Tuple[TableHarvesterABC, List[str]]

ckanapi_harvesters.harvesters.harvester_model module

Harvester metadata model classes (dataset, table and field metadata)

class ckanapi_harvesters.harvesters.harvester_model.DatasetMetadata

Bases: object

copy()
class ckanapi_harvesters.harvesters.harvester_model.FieldMetadata

Bases: object

copy()
class ckanapi_harvesters.harvesters.harvester_model.TableMetadata

Bases: object

copy()

ckanapi_harvesters.harvesters.harvester_params module

Harvester parameters. The base names of the parameters are shared between harvesters.

class ckanapi_harvesters.harvesters.harvester_params.DatabaseParams(source: DatabaseParams = None)

Bases: object

Class representing the parameters used to connect to a database. This class manages connection parameters such as the proxy and CA certificate. It also manages authentication parameters.

abstractmethod copy(*, dest=None)
initialize_from_cli_args(args: Namespace, base_dir: str = None, error_not_found: bool = True, default_proxies: dict = None, proxy_headers: dict = None) None
static parse_harvest_method(options_string: str) str
parse_options_string(options_string: str, *, base_dir: str = None, file_url_attr: str = None, parser: ArgumentParser = None) List[str]
print_help_cli(display: bool = True) str
property proxies: dict
property proxy_auth: AuthBase | Tuple[str, str]
property proxy_string: str
set_verify_server_ca(ca_cert: bool | str | None, enforce_ca_safety: bool = None) None
static setup_cli_harvester_parser(parser: ArgumentParser = None) ArgumentParser
static unlock_external_url_resource_download(value: bool = True)

This function enables the download of resources external to the CKAN server.

static unlock_no_server_ca(value: bool = True)

This function enables you to disable the CA verification of the CKAN server.

Warning: Only allow this in a local environment!

property verify_server_ca: bool | str | None
class ckanapi_harvesters.harvesters.harvester_params.DatasetParams(source: DatasetParams = None)

Bases: DatabaseParams

copy(*, dest=None)
initialize_from_cli_args(args: Namespace, base_dir: str = None, error_not_found: bool = True, default_proxies: dict = None, proxy_headers: dict = None) None
static setup_cli_harvester_parser(parser: ArgumentParser = None) ArgumentParser
class ckanapi_harvesters.harvesters.harvester_params.TableParams(source: TableParams = None)

Bases: DatasetParams

copy(*, dest=None)
initialize_from_cli_args(args: Namespace, base_dir: str = None, error_not_found: bool = True, default_proxies: dict = None, proxy_headers: dict = None) None
parse_options_string(options_string: str, *, base_dir: str = None, file_url_attr: str = None, parser: ArgumentParser = None) List[str]
static setup_cli_harvester_parser(parser: ArgumentParser = None) ArgumentParser

ckanapi_harvesters.harvesters.mongodb_data_cleaner module

Harvest from a MongoDB database using pymongo (data cleaner)

exception ckanapi_harvesters.harvesters.mongodb_data_cleaner.BrokenMongoRefError

Bases: Exception

class ckanapi_harvesters.harvesters.mongodb_data_cleaner.MongoDataCleanerUpload

Bases: CkanDataCleanerUploadGeom

Data cleaner operations specific to MongoDB objects.

clear_all_outputs()

Some values must not be cleared between DataFrame uploads. The cleaner is stateful: these persistent values are cleared only by this method.

clear_outputs_new_dataframe()
copy(dest=None) MongoDataCleanerUpload
static get_class_keyword() str

Returns the name of the class, according to data_cleaner_dict defined in data_cleaner_init.py. This name is used to set up the data cleaner for a resource builder.

ckanapi_harvesters.harvesters.mongodb_data_cleaner.mongo_default_data_cleaner() MongoDataCleanerUpload
ckanapi_harvesters.harvesters.mongodb_data_cleaner.mongo_default_df_conversion(documents: List[dict], **kwargs) DataFrame | ListRecords

ckanapi_harvesters.harvesters.mongodb_harvester module

Harvest from a MongoDB database using pymongo

class ckanapi_harvesters.harvesters.mongodb_harvester.DatabaseHarvesterMongoServer(params: DatabaseParams = None)

Bases: DatabaseHarvesterABC

This class manages the connection to a MongoDB server. It can list datasets (MongoDB databases) but this call could lead to an error.

check_connection(*, new_connection: bool = False, raise_error: bool = False) None | ContextErrorLevelMessage
connect(*, cancel_if_connected: bool = True) Any
copy(*, dest=None)
disconnect() None
get_dataset_harvester(dataset_name: str) DatasetHarvesterMongoDatabase
get_description() str
get_login_url_without_auth() str
static init_from_options_string(options_string: str, base_dir: str = None) Tuple[DatabaseHarvesterMongoServer, List[str]]
is_connected() bool
list_datasets(return_metadata: bool = True) List[str] | OrderedDict[str, DatasetMetadata]
class ckanapi_harvesters.harvesters.mongodb_harvester.DatasetHarvesterMongoDatabase(params: DatasetParams = None)

Bases: DatabaseHarvesterMongoServer, DatasetHarvesterABC

A CKAN dataset corresponds to a MongoDB database (set of collections).

check_connection(*, new_connection: bool = False, raise_error: bool = False) None | ContextErrorLevelMessage
clean_dataset_metadata() DatasetMetadata
connect(*, cancel_if_connected: bool = True) Any
disconnect() None
get_description() str
get_table_harvester(table_name: str) TableHarvesterMongoCollection
static init_from_options_string(options_string: str, base_dir: str = None) Tuple[DatasetHarvesterMongoDatabase, List[str]]
is_connected() bool
list_tables(return_metadata: bool = True) List[str] | OrderedDict[str, TableMetadata]
query_dataset_metadata(cancel_if_present: bool = True) DatasetMetadata
class ckanapi_harvesters.harvesters.mongodb_harvester.TableHarvesterMongoCollection(params: TableParamsMongoCollection = None)

Bases: DatasetHarvesterMongoDatabase, TableHarvesterABC

A table (CKAN DataStore) corresponds to a MongoDB collection.

check_connection(*, new_connection: bool = False, raise_error: bool = False) None | ContextErrorLevelMessage
clean_table_metadata() TableMetadata
connect(*, cancel_if_connected: bool = True) Any
copy(*, dest=None)
disconnect() None
get_default_data_cleaner() CkanDataCleanerABC | None
get_default_primary_key() List[str]
get_description() str
static init_from_options_string(options_string: str, *, base_dir: str = None, file_url_attr: str = None) Tuple[TableHarvesterMongoCollection, List[str]]
list_queries(*, new_connection: bool = False) List[Tuple[dict, int]]
query_data(query: Dict[str, Any]) List[dict]
query_table_metadata(cancel_if_present: bool = True) TableMetadata

ckanapi_harvesters.harvesters.mongodb_params module

Harvest from a MongoDB database using pymongo (parameters)

class ckanapi_harvesters.harvesters.mongodb_params.TableParamsMongoCollection(source: TableParamsMongoCollection = None)

Bases: TableParams

A table (CKAN DataStore) corresponds to a MongoDB collection. This subclass of TableParams implements an alias attribute for table name called collection.

property collection: str
copy(*, dest=None)
initialize_from_cli_args(args: Namespace, base_dir: str = None, error_not_found: bool = True, default_proxies: dict = None, proxy_headers: dict = None) None
static setup_cli_harvester_parser(parser: ArgumentParser = None) ArgumentParser

ckanapi_harvesters.harvesters.postgre_harvester module

Harvest from a PostgreSQL database using sqlalchemy

class ckanapi_harvesters.harvesters.postgre_harvester.DatabaseHarvesterPostgre(params: DatabaseParams = None)

Bases: DatabaseHarvesterABC

This class manages the connection to a PostgreSQL database server. It can list schemas (corresponding to CKAN datasets).

check_connection(*, new_connection: bool = False, raise_error: bool = False) None | ContextErrorLevelMessage
connect(*, cancel_if_connected: bool = True) Any
copy(*, dest=None)
disconnect() None
get_dataset_harvester(dataset_name: str) DatasetHarvesterPostgre
get_description() str
get_login_url_without_auth() str
static init_from_options_string(options_string: str, base_dir: str = None) Tuple[DatabaseHarvesterPostgre, List[str]]
is_connected() bool
list_datasets(return_metadata: bool = True) List[str] | OrderedDict[str, DatasetMetadata]
class ckanapi_harvesters.harvesters.postgre_harvester.DatasetHarvesterPostgre(params: DatasetParamsPostgreSchema = None)

Bases: DatabaseHarvesterPostgre, DatasetHarvesterABC

A CKAN dataset corresponds to a PostgreSQL schema (set of tables).

check_connection(*, new_connection: bool = False, raise_error: bool = False) None | ContextErrorLevelMessage
clean_dataset_metadata() DatasetMetadata
connect(*, cancel_if_connected: bool = True) Any
disconnect() None
get_description() str
get_table_harvester(table_name: str) TableHarvesterPostgre
static init_from_options_string(options_string: str, base_dir: str = None) Tuple[DatasetHarvesterPostgre, List[str]]
is_connected() bool
list_tables(return_metadata: bool = True) List[str] | OrderedDict[str, TableMetadata]
query_dataset_metadata(cancel_if_present: bool = True) DatasetMetadata
class ckanapi_harvesters.harvesters.postgre_harvester.TableHarvesterPostgre(params: TableParamsPostgre = None)

Bases: DatasetHarvesterPostgre, TableHarvesterABC

A CKAN table (DataStore) corresponds to a PostgreSQL table.

_data_type_map_to_ckan(field_metadata: FieldMetadata) None

Some data types need to be translated

_get_field_query_function(field_metadata: FieldMetadata) str

Force some data types to return as text

check_connection(*, new_connection: bool = False, raise_error: bool = False) None | ContextErrorLevelMessage
clean_table_metadata() TableMetadata
connect(*, cancel_if_connected: bool = True) Any
copy(*, dest=None)
get_default_data_cleaner() CkanDataCleanerABC
get_default_primary_key() List[str]
get_description() str
static init_from_options_string(options_string: str, *, base_dir: str = None, file_url_attr: str = None) Tuple[TableHarvesterPostgre, List[str]]
list_queries(*, new_connection: bool = False) List[Tuple[str, int]]
query_data(query: Dict[str, Any]) DataFrame
query_table_metadata(cancel_if_present: bool = True) TableMetadata
update_from_ckan(ckan)

ckanapi_harvesters.harvesters.postgre_params module

Harvest from a PostgreSQL database (parameters)

class ckanapi_harvesters.harvesters.postgre_params.DatasetParamsPostgreSchema(source: DatasetParamsPostgreSchema = None)

Bases: DatasetParams

A CKAN dataset corresponds to a PostgreSQL schema (set of tables). This subclass of DatasetParams implements an alias attribute for dataset name called schema.

copy(*, dest=None)
initialize_from_cli_args(args: Namespace, base_dir: str = None, error_not_found: bool = True, default_proxies: dict = None, proxy_headers: dict = None) None
property schema: str
static setup_cli_harvester_parser(parser: ArgumentParser = None) ArgumentParser
class ckanapi_harvesters.harvesters.postgre_params.TableParamsPostgre(source: TableParamsPostgre = None)

Bases: TableParams

copy(*, dest=None)
initialize_from_cli_args(args: Namespace, base_dir: str = None, error_not_found: bool = True, default_proxies: dict = None, proxy_headers: dict = None) None
property schema: str
static setup_cli_harvester_parser(parser: ArgumentParser = None) ArgumentParser

ckanapi_harvesters.harvesters.pymongo_harvester module

Deprecated module name alias for mongodb_harvester

Module contents

Section of the package dedicated to harvesting data from APIs or databases.