Source code for sdssdb.utils.ingest

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# @Author: José Sánchez-Gallego (gallegoj@uw.edu)
# @Date: 2019-09-21
# @Filename: ingest.py
# @License: BSD 3-clause (http://www.opensource.org/licenses/BSD-3-Clause)

import functools
import io
import multiprocessing
import os
import re
import warnings

import numpy
import peewee
from playhouse.postgres_ext import ArrayField
from playhouse.reflection import generate_models
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.ext.declarative import DeferredReflection, declarative_base

from sdssdb import log
from sdssdb.connection import SQLADatabaseConnection
from sdssdb.sqlalchemy import BaseModel


try:
    import progressbar
except ImportError:
    # Use None so that the ``progressbar is None`` checks below work.
    progressbar = None

try:
    import inflect
except ImportError:
    inflect = None


__all__ = ('to_csv', 'copy_data', 'drop_table', 'create_model_from_table',
           'bulk_insert', 'file_to_db', 'create_adhoc_database')


DTYPE_TO_FIELD = {
    'i2': peewee.SmallIntegerField,
    'i4': peewee.IntegerField,
    'i8': peewee.BigIntegerField,
    'f4': peewee.FloatField,
    'f8': peewee.DoubleField,
    'S([0-9]+)': peewee.CharField
}
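
# For example, an integer column with dtype '<i4' maps to
# peewee.IntegerField, while a fixed-width bytes column such as 'S10'
# matches the 'S([0-9]+)' regex and yields a CharField(max_length=10) in
# create_model_from_table below.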


def to_csv(table, path, header=True, delimiter='\t',
           use_multiprocessing=False, workers=4):
    """Creates a PostgreSQL-valid CSV file from a table, handling arrays.

    Parameters
    ----------
    table : astropy.table.Table
        The table to convert.
    path : str
        The path to which to write the CSV file.
    header : bool
        Whether to add a header with the column names.
    delimiter : str
        The delimiter between columns in the CSV file.
    use_multiprocessing : bool
        Whether to use multiple cores. The rows of the resulting file will
        not have the same ordering as the original table.
    workers : int
        How many workers to use with multiprocessing.

    """

    if use_multiprocessing:
        # Close the pool once the rows have been converted.
        with multiprocessing.Pool(workers) as pool:
            tmp_list = pool.map(functools.partial(convert_row_to_psql,
                                                  delimiter=delimiter),
                                table, chunksize=1000)
    else:
        tmp_list = [convert_row_to_psql(row, delimiter=delimiter)
                    for row in table]

    csv_str = '\n'.join(tmp_list)

    if header:
        csv_str = delimiter.join(table.colnames) + '\n' + csv_str

    # Use a context manager so the file is flushed and closed on exit.
    with open(path, 'w') as unit:
        unit.write(csv_str)
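
# A minimal usage sketch for to_csv (the table contents and output path are
# hypothetical):
#
#   >>> from astropy.table import Table
#   >>> tbl = Table({'idx': [1, 2], 'flux': [[1.0, 2.0], [3.0, 4.0]]})
#   >>> to_csv(tbl, '/tmp/tbl.csv')  # array columns become "{...}" literals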


def table_exists(table_name, connection, schema=None):
    """Returns `True` if a table exists in a database.

    Parameters
    ----------
    table_name : str
        The name of the table.
    connection : .PeeweeDatabaseConnection
        The Peewee database connection to use.
    schema : str
        The schema in which the table lives.

    """

    return connection.table_exists(table_name, schema=schema)


def drop_table(table_name, connection, cascade=False, schema=None):
    """Drops a table. Does nothing if the table does not exist.

    Parameters
    ----------
    table_name : str
        The name of the table to be dropped.
    connection : .PeeweeDatabaseConnection
        The Peewee database connection to use.
    cascade : bool
        Whether to drop related tables using ``CASCADE``.
    schema : str
        The schema in which the table lives.

    Returns
    -------
    result : bool
        Returns `True` if the table was correctly dropped or `False` if the
        table does not exist and nothing was done.

    """

    if not table_exists(table_name, connection, schema=schema):
        return False

    # Only qualify the table name when a schema has been given.
    table_sql = f'{schema}.{table_name}' if schema else table_name
    connection.execute_sql(f'DROP TABLE {table_sql}' +
                           (' CASCADE;' if cascade else ';'))

    return True
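
# Usage sketch, assuming ``db`` is a connected PeeweeDatabaseConnection and
# the table name is hypothetical:
#
#   >>> drop_table('staging_targets', db, schema='public', cascade=True)
#   True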


def create_model_from_table(table_name, table, schema=None, lowercase=False,
                            primary_key=None):
    """Returns a `~peewee:Model` from the columns in a table.

    Parameters
    ----------
    table_name : str
        The name of the table.
    table : ~astropy.table.Table
        An astropy table whose column names and types will be used to create
        the model.
    schema : str
        The schema in which the table lives.
    lowercase : bool
        If `True`, all column names will be converted to lower case.
    primary_key : str
        The name of the column to mark as primary key.

    """

    # Prevents name confusion when setting schema in Meta.
    schema_ = schema

    attrs = {}

    class BaseModel(peewee.Model):

        class Meta:
            db_table = table_name
            schema = schema_
            primary_key = False

    for ii, column_name in enumerate(table.dtype.names):

        if lowercase:
            column_name = column_name.lower()

        column_dtype = table.dtype[ii]

        field_kwargs = {}

        # Use a separate variable so that the primary_key argument is not
        # overwritten and can still be compared against later columns.
        is_pk = primary_key is not None and primary_key == column_name

        ColumnField = None
        type_found = False

        for dtype, Field in DTYPE_TO_FIELD.items():

            match = re.match(dtype, column_dtype.base.str[1:])
            if match:

                # Fixed-width string dtypes carry their length in the regex
                # group, which becomes the CharField max_length.
                if column_dtype.base.str[1] == 'S':
                    field_kwargs['max_length'] = int(match.group(1))

                ColumnField = Field

                if len(column_dtype.shape) == 1:
                    ColumnField = ArrayField(ColumnField,
                                             field_kwargs=field_kwargs,
                                             dimensions=column_dtype.shape[0],
                                             null=True, primary_key=is_pk)
                elif len(column_dtype.shape) > 1:
                    raise ValueError(f'column {column_name} with dtype '
                                     f'{column_dtype}: multidimensional '
                                     'arrays are not supported.')
                else:
                    ColumnField = ColumnField(**field_kwargs, null=True,
                                              primary_key=is_pk)

                type_found = True
                break

        if not type_found:
            raise ValueError(f'cannot find an appropriate field type for '
                             f'column {column_name} with dtype '
                             f'{column_dtype}.')

        attrs[column_name] = ColumnField

    return type(str(table_name), (BaseModel,), attrs)
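
# A sketch of building and creating a table model, assuming ``db`` is a
# connected PeeweeDatabaseConnection (table and column names are
# hypothetical):
#
#   >>> from astropy.table import Table
#   >>> tbl = Table({'pk': [1, 2], 'mag': [10.5, 11.2]})
#   >>> Model = create_model_from_table('targets', tbl, schema='public',
#   ...                                 primary_key='pk')
#   >>> Model._meta.database = db  # bind the model before creating the table
#   >>> Model.create_table()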


def convert_row_to_psql(row, delimiter='\t', null='\\N'):
    """Converts an astropy table row to a PostgreSQL-valid CSV string."""

    row_data = []

    for col_value in row:
        if numpy.isscalar(col_value):
            row_data.append(str(col_value))
        elif numpy.ma.is_masked(col_value):
            row_data.append(null)
        else:
            if col_value.dtype.base.str[1] == 'S':
                col_value = col_value.astype('U')
            row_data.append(
                str(col_value.tolist())
                .replace('\n', '')
                .replace('\'', '\"')
                .replace('[', '\"{').replace(']', '}\"'))

    return delimiter.join(row_data)
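
# For example, a scalar plus a 1-D array row serialises to a tab-separated
# string with a quoted PostgreSQL array literal:
#
#   >>> import astropy.table
#   >>> tbl = astropy.table.Table({'a': [1], 'b': numpy.array([[1.0, 2.0]])})
#   >>> convert_row_to_psql(tbl[0])
#   '1\t"{1.0, 2.0}"'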


def copy_data(data, connection, table_name, schema=None, chunk_size=10000,
              show_progress=False):
    """Loads data into a DB table using ``COPY``.

    Parameters
    ----------
    data : ~astropy.table.Table
        An astropy table with the data to load.
    connection : .PeeweeDatabaseConnection
        The Peewee database connection to use.
    table_name : str
        The name of the table.
    schema : str
        The schema in which the table lives.
    chunk_size : int
        How many rows to load at once.
    show_progress : bool
        If `True`, shows a progress bar. Requires the `progressbar2
        <https://progressbar-2.readthedocs.io/en/latest/>`__ module to be
        installed.

    """

    table_sql = '{0}.{1}'.format(schema, table_name) if schema else table_name

    cursor = connection.cursor()

    # If the progressbar package is installed, uses it to create a
    # progress bar.
    if show_progress:
        if progressbar is None:
            warnings.warn('progressbar2 is not installed. '
                          'Will not show a progress bar.')
            iterable = range(len(data))
        else:
            bar = progressbar.ProgressBar()
            iterable = bar(range(len(data)))
    else:
        iterable = range(len(data))

    # TODO: it's probably more efficient to convert each column to string
    # first (by chunks) and then unzip them into a single string. That way
    # we only iterate over columns instead of over rows and columns.

    chunk = 0
    tmp_list = []

    for ii in iterable:

        row = data[ii]
        tmp_list.append(convert_row_to_psql(row))
        chunk += 1

        # If we have reached a chunk commit point, or this is the last item,
        # copy and commit to the database.
        last_item = ii == len(data) - 1
        if chunk == chunk_size or (last_item and len(tmp_list) > 0):
            ss = io.StringIO('\n'.join(tmp_list))
            cursor.copy_from(ss, table_sql)
            connection.commit()
            tmp_list = []
            chunk = 0

    cursor.close()

    return
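
# Usage sketch, assuming the target table already exists and ``db`` is a
# connected PeeweeDatabaseConnection (``tbl`` is a hypothetical astropy
# Table whose columns match the table):
#
#   >>> copy_data(tbl, db, 'targets', schema='public', chunk_size=50000)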


def bulk_insert(data, connection, model, chunk_size=100000,
                show_progress=False):
    """Loads data into a DB table using bulk insert.

    Parameters
    ----------
    data : ~astropy.table.Table
        An astropy table with the data to insert.
    connection : .PeeweeDatabaseConnection
        The Peewee database connection to use.
    model : ~peewee:Model
        The model representing the database table into which to insert the
        data.
    chunk_size : int
        How many rows to load at once.
    show_progress : bool
        If `True`, shows a progress bar. Requires the `progressbar2
        <https://progressbar-2.readthedocs.io/en/latest/>`__ module to be
        installed.

    """

    from . import adaptors  # noqa

    if show_progress:
        if progressbar is None:
            warnings.warn('progressbar2 is not installed. '
                          'Will not show a progress bar.')
            bar = None
        else:
            bar = progressbar.ProgressBar(max_value=len(data)).start()
    else:
        bar = None

    n_chunk = 0
    with connection.atomic():
        for batch in peewee.chunked(data, chunk_size):
            model.insert_many(batch).execute()
            if bar:
                n_chunk += chunk_size
                bar.update(n_chunk)

    return
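
# Usage sketch, assuming ``Model`` was produced by create_model_from_table
# and bound to ``db`` (``tbl`` is a hypothetical astropy Table):
#
#   >>> bulk_insert(tbl, db, Model, chunk_size=50000, show_progress=True)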


def file_to_db(input_, connection, table_name, schema=None, lowercase=False,
               create=False, drop=False, truncate=False, primary_key=None,
               load_data=True, use_copy=True, chunk_size=100000,
               show_progress=False):
    """Loads a table from a file to a database.

    Loads a file or a `~astropy.table.Table` object into a database. If
    ``create=True`` a new table will be created, with column types matching
    those of the input table. All columns are initially defined as ``NULL``.

    By default, the data are loaded using the ``COPY`` method to optimise
    performance. This can be disabled if needed.

    Parameters
    ----------
    input_ : str or ~astropy.table.Table
        The path to a file that will be opened using `Table.read
        <astropy.table.Table.read>` or an astropy `~astropy.table.Table`.
    connection : .PeeweeDatabaseConnection
        The Peewee database connection to use (SQLAlchemy connections are
        not supported).
    table_name : str
        The name of the table into which to load the data, or to be created.
    schema : str
        The schema in which the table lives.
    lowercase : bool
        If `True`, all column names will be converted to lower case.
    create : bool
        Creates the table if it does not exist.
    drop : bool
        Drops the table before recreating it. Implies ``create=True``. Note
        that a ``CASCADE`` drop will be executed. Use with caution.
    truncate : bool
        Truncates the table before loading the data but maintains the
        existing columns.
    primary_key : str
        The name of the column to mark as primary key (ignored if the table
        is not being created).
    load_data : bool
        If `True`, loads the data; otherwise just creates the table in the
        database.
    use_copy : bool
        When `True` (recommended) uses the SQL ``COPY`` command to load the
        data from a CSV stream.
    chunk_size : int
        How many rows to load at once.
    show_progress : bool
        If `True`, shows a progress bar. Requires the ``progressbar2``
        module to be installed.

    Returns
    -------
    model : ~peewee:Model
        The model for the table created.

    """

    import astropy.table

    # If we drop we need to re-create but there is no need to truncate.
    if drop:
        create = True
        truncate = False

    if isinstance(input_, str) and os.path.isfile(input_):
        table = astropy.table.Table.read(input_)
    else:
        assert isinstance(input_, astropy.table.Table)
        table = input_

    if drop:
        # A CASCADE drop, as documented above.
        drop_table(table_name, connection, schema=schema, cascade=True)

    if table_exists(table_name, connection, schema=schema):
        Model = generate_models(connection, schema=schema,
                                table_names=[table_name])[table_name]
    else:
        if not create:
            raise ValueError(f'table {table_name} does not exist. '
                             'Call the function with create=True '
                             'if you want to create it.')
        Model = create_model_from_table(table_name, table, schema=schema,
                                        lowercase=lowercase,
                                        primary_key=primary_key)
        Model._meta.database = connection
        Model.create_table()

    if truncate:
        Model.truncate_table()

    if load_data:
        if use_copy:
            copy_data(table, connection, table_name, schema=schema,
                      chunk_size=chunk_size, show_progress=show_progress)
        else:
            bulk_insert(table, connection, Model, chunk_size=chunk_size,
                        show_progress=show_progress)

    return Model
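
# A typical end-to-end call, assuming ``db`` is a connected
# PeeweeDatabaseConnection (file, schema, and column names are
# hypothetical):
#
#   >>> Model = file_to_db('/data/targets.fits', db, 'targets',
#   ...                    schema='public', create=True, primary_key='pk')
#   >>> Model.select().count()  # the newly loaded table is ready to query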


def create_adhoc_database(dbname, schema=None, profile='local'):
    """Creates an ad hoc SQLA database and models, given an existing db.

    Creates an in-memory SQLA database connection given a database name to
    connect to, along with auto-generated models for a given schema name.
    Currently limited to building models for one schema at a time. Useful
    for temporarily creating and trying a database connection, and simple
    models, without building and committing a full-fledged new database
    connection.

    Parameters
    ----------
    dbname : str
        The name of the database to create a connection for.
    schema : str
        The name of the schema to create mappings for.
    profile : str
        The database profile to connect with.

    Returns
    -------
    tuple
        A temporary database connection and module of model classes.

    Example
    -------
    >>> from sdssdb.utils.ingest import create_adhoc_database
    >>> tempdb, models = create_adhoc_database('datamodel', schema='filespec')
    >>> tempdb
    <DatamodelDatabaseConnection (dbname='datamodel', profile='local', connected=True)>
    >>> models.File
    sqlalchemy.ext.automap.File

    """

    # create the database
    dbclass = f'{dbname.title()}DatabaseConnection'
    base = declarative_base(cls=(DeferredReflection, BaseModel,))
    tempdb_class = type(dbclass, (SQLADatabaseConnection,),
                        {'dbname': dbname, 'base': automap_base(base)})
    tempdb = tempdb_class(profile=profile, autoconnect=True)

    if tempdb.connected is False:
        log.warning(f'Could not connect to database: {dbname}. '
                    'Please check that the database exists. '
                    'Cannot automap models.')
        return tempdb, None

    # automap the models
    tempdb.base.prepare(tempdb.engine, reflect=True, schema=schema,
                        classname_for_table=camelize_classname,
                        name_for_collection_relationship=pluralize_collection)
    models = tempdb.base.classes

    return tempdb, models


def camelize_classname(base, tablename, table):
    """Produces a 'camelized' class name.

    Converts a database table name to camelcase, using underscores to denote
    a new hump, e.g. ``'words_and_underscores' -> 'WordsAndUnderscores'``.
    See
    https://docs.sqlalchemy.org/en/13/orm/extensions/automap.html#overriding-naming-schemes.

    Parameters
    ----------
    base : ~sqlalchemy.ext.automap.AutomapBase
        The AutomapBase class doing the prepare.
    tablename : str
        The string name of the Table.
    table : ~sqlalchemy.schema.Table
        The Table object itself.

    Returns
    -------
    str
        A string class name.

    """

    return str(tablename[0].upper() +
               re.sub(r'_([a-z])', lambda m: m.group(1).upper(),
                      tablename[1:]))


def pluralize_collection(base, local_cls, referred_cls, constraint):
    """Produces an 'uncamelized', 'pluralized' class name.

    Converts a camel-cased class name into an uncamelized, pluralized class
    name, e.g. ``'SomeTerm' -> 'some_terms'``. Used when auto-defining
    relationship names. See
    https://docs.sqlalchemy.org/en/13/orm/extensions/automap.html#overriding-naming-schemes.

    Parameters
    ----------
    base : ~sqlalchemy.ext.automap.AutomapBase
        The AutomapBase class doing the prepare.
    local_cls : object
        The class to be mapped on the local side.
    referred_cls : object
        The class to be mapped on the referring side.
    constraint : ~sqlalchemy.schema.ForeignKeyConstraint
        The ForeignKeyConstraint that is being inspected to produce this
        relationship.

    Returns
    -------
    str
        An uncamelized, pluralized string class name.

    """

    assert inflect, 'pluralize_collection requires the inflect library.'

    referred_name = referred_cls.__name__
    uncamelized = re.sub(r'[A-Z]',
                         lambda m: '_%s' % m.group(0).lower(),
                         referred_name)[1:]
    _pluralizer = inflect.engine()
    pluralized = _pluralizer.plural(uncamelized)

    return pluralized
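
# These naming helpers invert each other in spirit; for example:
#
#   >>> camelize_classname(None, 'words_and_underscores', None)
#   'WordsAndUnderscores'
#   >>> class SomeTerm: pass
#   >>> pluralize_collection(None, None, SomeTerm, None)  # needs inflect
#   'some_terms'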