# -*- coding: utf-8 -*-
"""H5/NETCDF file collection."""
import glob
import logging
import os
import time
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor, as_completed
from warnings import warn

import numpy as np
import pandas as pd
import psutil
import xarray as xr
from gaps import Status
from rex.utilities.fun_utils import get_fun_call_str
from rex.utilities.loggers import init_logger
from scipy.spatial import KDTree

from sup3r.postprocessing.file_handling import OutputMixIn, RexOutputs
from sup3r.utilities import ModuleName
from sup3r.utilities.cli import BaseCLI

logger = logging.getLogger(__name__)


class BaseCollector(OutputMixIn, ABC):
    """Common base class for the H5 and NETCDF chunk-file collectors"""

    def __init__(self, file_paths):
        """Parameters
        ----------
        file_paths : list | str
            Explicit list of str file paths that will be sorted and collected
            or a single string with unix-style /search/patt*ern.<ext>. Any
            value that is not a list is expanded with ``glob.glob``. Files
            should have non-overlapping time_index and spatial domains.
        """
        if isinstance(file_paths, list):
            resolved = file_paths
        else:
            # Anything that is not a list is treated as a glob pattern
            resolved = glob.glob(file_paths)

        # file_paths keeps the given / globbed order, flist is the sorted copy
        self.file_paths = resolved
        self.flist = sorted(resolved)
        # Placeholders filled in by the concrete collectors
        self.data = None
        self.file_attrs = {}

    @classmethod
    @abstractmethod
    def collect(cls, *args, **kwargs):
        """Collect chunked data files into a single output file. Concrete
        collectors must implement this."""

    @classmethod
    def get_node_cmd(cls, config):
        """Build the ``python -c`` command string that runs ``cls.collect``.

        Parameters
        ----------
        config : dict
            sup3r collection config with all necessary args and kwargs to
            run data collection. The optional keys ``log_file``,
            ``log_level`` (default 'INFO') and ``pipeline_step`` are read
            here directly.

        Returns
        -------
        cmd : str
            Command string with every backslash replaced by a forward slash.
        """
        import_str = (
            'from sup3r.postprocessing.collection '
            f'import {cls.__name__};\n'
            'from rex import init_logger;\n'
            'import time;\n'
            'from gaps import Status;\n'
        )

        dc_fun_str = get_fun_call_str(cls.collect, config)

        # Argument string for the init_logger call embedded in the command
        level = config.get('log_level', 'INFO')
        log_arg_str = f'"sup3r", log_level="{level}"'
        log_path = config.get('log_file', None)
        if log_path is not None:
            log_arg_str += f', log_file="{log_path}"'

        script = (
            f"python -c \'{import_str}\n"
            "t0 = time.time();\n"
            f"logger = init_logger({log_arg_str});\n"
            f"{dc_fun_str};\n"
            "t_elap = time.time() - t0;\n"
        )

        step = config.get('pipeline_step') or ModuleName.DATA_COLLECT
        script = BaseCLI.add_status_cmd(config, step, script)
        # Close the quote opened right after "python -c"
        script += ";\'\n"

        return script.replace('\\', '/')


class CollectorNC(BaseCollector):
    """Sup3r NETCDF file collection framework"""

    @classmethod
    def collect(
        cls,
        file_paths,
        out_file,
        features,
        log_level=None,
        log_file=None,
        write_status=False,
        job_name=None,
        overwrite=True,
        res_kwargs=None
    ):
        """Collect data files from a dir to one output file.

        Filename requirements:
         - Should end with ".nc"

        Parameters
        ----------
        file_paths : list | str
            Explicit list of str file paths that will be sorted and collected
            or a single string with unix-style /search/patt*ern.nc.
        out_file : str
            File path of final output file. May be a bare file name, in which
            case no output directory is created.
        features : list
            List of dsets to collect. A variable is kept when its name or its
            lower-cased name is in this list.
        log_level : str | None
            Desired log level, None will not initialize logging.
        log_file : str | None
            Target log file. None logs to stdout.
        write_status : bool
            Flag to write status file once complete if running from pipeline.
        job_name : str
            Job name for status file if running from pipeline.
        overwrite : bool
            Whether to overwrite existing output file. If False and the
            output file already exists nothing is written.
        res_kwargs : dict | None
            Dictionary of kwargs to pass to xarray.open_mfdataset.
        """
        t0 = time.time()

        # Initialize logging before the first message is emitted.
        # NOTE(review): the logger name is 'sup3r.preprocessing' although
        # this module lives under sup3r.postprocessing - confirm intended.
        if log_level is not None:
            init_logger(
                'sup3r.preprocessing', log_file=log_file, log_level=log_level
            )

        logger.info(
            f'Initializing collection for file_paths={file_paths}'
        )

        # dirname is '' for a bare file name and os.makedirs('') raises, so
        # only create the directory when there is one to create.
        out_dir = os.path.dirname(out_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        collector = cls(file_paths)
        logger.info(
            'Collecting {} files to {}'.format(len(collector.flist), out_file)
        )
        if overwrite and os.path.exists(out_file):
            logger.info(f'overwrite=True, removing {out_file}.')
            os.remove(out_file)

        if not os.path.exists(out_file):
            res_kwargs = res_kwargs or {}
            # Context manager closes the source files once the output has
            # been written.
            with xr.open_mfdataset(collector.flist, **res_kwargs) as out:
                keep = [feat for feat in out if feat in features
                        or feat.lower() in features]
                out[keep].to_netcdf(out_file)

        if write_status and job_name is not None:
            status = {
                'out_dir': out_dir,
                'fout': out_file,
                'flist': collector.flist,
                'job_status': 'successful',
                'runtime': (time.time() - t0) / 60,
            }
            Status.make_single_job_file(
                out_dir, 'collect', job_name, status
            )

        logger.info('Finished file collection.')

    def group_spatial_chunks(self):
        """Group same spatial chunks together so each chunk has same spatial
        footprint but different times

        Returns
        -------
        chunks : dict
            Keys are paths of the form <dirname>/s_<token>.nc where <token>
            is the part of the file name before the first underscore. Values
            are the lists of files from ``self.flist`` sharing that token.
        """
        chunks = {}
        for file in self.flist:
            dirname = os.path.dirname(file)
            # Take the token from the file name only so underscores or
            # separators in the directory part cannot leak into the key.
            s_chunk = os.path.basename(file).split('_')[0]
            s_file = os.path.join(dirname, f's_{s_chunk}.nc')
            # Collect the source file itself (not the group key) so each
            # group lists the files that belong to it.
            chunks.setdefault(s_file, []).append(file)
        return chunks


class CollectorH5(BaseCollector):
    """Sup3r H5 file collection framework"""

    @classmethod
    def get_slices(
        cls, final_time_index, final_meta, new_time_index, new_meta
    ):
        """Get index slices where the new ti/meta belong in the final ti/meta.

        Parameters
        ----------
        final_time_index : pd.Datetimeindex
            Time index of the final file that new_time_index is being written
            to.
        final_meta : pd.DataFrame
            Meta data of the final file that new_meta is being written to.
        new_time_index : pd.Datetimeindex
            Chunk time index that is a subset of the final_time_index.
        new_meta : pd.DataFrame
            Chunk meta data that is a subset of the final_meta.

        Returns
        -------
        row_slice : slice
            final_time_index[row_slice] = new_time_index
        col_slice : slice
            final_meta[col_slice] = new_meta
        """
        final_index = final_meta.index
        new_index = new_meta.index
        row_loc = np.where(final_time_index.isin(new_time_index))[0]
        col_loc = np.where(final_meta['gid'].isin(new_meta['gid']))[0]

        if not len(row_loc) > 0:
            msg = (
                'Could not find row locations in file collection. '
                'New time index: {} final time index: {}'.format(
                    new_time_index, final_time_index
                )
            )
            logger.error(msg)
            raise RuntimeError(msg)

        if not len(col_loc) > 0:
            msg = (
                'Could not find col locations in file collection. '
                'New index: {} final index: {}'.format(new_index, final_index)
            )
            logger.error(msg)
            raise RuntimeError(msg)

        row_slice = slice(np.min(row_loc), np.max(row_loc) + 1)

        msg = (
            f'row_slice={row_slice} conflict with row_indices={row_loc}. '
            'Indices do not seem to be increasing and/or contiguous.'
        )
        assert (row_slice.stop - row_slice.start) == len(row_loc), msg

        return row_slice, col_loc

    def get_coordinate_indices(self, target_meta, full_meta, threshold=1e-4):
        """Get coordinate indices in meta data for given targets

        Parameters
        ----------
        target_meta : pd.DataFrame
            Dataframe of coordinates to find within the full meta. Needs
            'latitude' and 'longitude' columns.
        full_meta : pd.DataFrame
            Dataframe of full set of coordinates for unfiltered dataset.
            Needs 'latitude' and 'longitude' columns.
        threshold : float
            Threshold distance for finding target coordinates within full meta

        Returns
        -------
        indices : np.ndarray
            Row positions in full_meta of the nearest neighbor of each target
            coordinate, in target order. Entries that are not smaller than
            len(full_meta) are dropped (scipy's KDTree is documented to
            report a missing neighbor that way), so the result can be shorter
            than target_meta.
        """

        def _lat_lon(meta):
            """Stack latitude / longitude into an (n, 2) array"""
            return np.column_stack(
                (meta.latitude.values, meta.longitude.values)
            )

        n_full = len(full_meta)
        _, nearest = KDTree(_lat_lon(full_meta)).query(
            _lat_lon(target_meta), distance_upper_bound=threshold
        )
        return nearest[nearest < n_full]

    def get_data(
        self,
        file_path,
        feature,
        time_index,
        meta,
        scale_factor,
        dtype,
        threshold=1e-4,
    ):
        """Read a data array from a chunked file and write it into
        ``self.data`` at the location given by the chunk's time index and
        meta.

        ``self.data`` must already be allocated with the shape of the final
        (time_index, meta) array before this is called.

        Parameters
        ----------
        file_path : str
            h5 file to get data from
        feature : str
            dataset to retrieve data from fpath.
        time_index : pd.Datetimeindex
            Time index of the final file.
        meta : pd.DataFrame
            Meta data of the final file.
        scale_factor : int | float
            Final destination scale factor after collection. If the data
            retrieval from the files to be collected has a different scale
            factor, the collected data is cast to float32 and rescaled before
            being cast to ``dtype``.
        dtype : np.dtype
            Final dtype the data is cast to. For integer dtypes the data is
            rounded first.
        threshold : float
            Threshold distance for finding target coordinates within full meta

        Returns
        -------
        None
            Data is written to ``self.data`` in place. If none of the
            coordinates in ``meta`` are found in the file a warning is issued
            and nothing is written.

        Raises
        ------
        KeyError
            If ``feature`` is not a dataset in ``file_path``.
        OSError
            If the data cannot be assigned into ``self.data``.
        """
        with RexOutputs(file_path, unscale=False, mode='r') as f:
            # Check the dataset exists before any per-dataset lookup so a
            # missing feature produces the descriptive error below rather
            # than failing on the attribute lookup.
            if feature not in f.dsets:
                e = (
                    'Trying to collect dataset "{}" but cannot find in '
                    'available: {}'.format(feature, f.dsets)
                )
                logger.error(e)
                raise KeyError(e)

            f_ti = f.time_index
            f_meta = f.meta
            source_scale_factor = f.attrs[feature].get('scale_factor', 1)

            mask = self.get_coordinate_indices(
                meta, f_meta, threshold=threshold
            )

            # Nothing to collect from this file: skip the data read entirely
            if len(mask) == 0:
                msg = (
                    'No target coordinates found in masked meta. '
                    f'Skipping collection for {file_path}.'
                )
                logger.warning(msg)
                warn(msg)
                return

            f_meta = f_meta.iloc[mask]
            f_data = f[feature][:, mask]

        row_slice, col_slice = self.get_slices(
            time_index, meta, f_ti, f_meta
        )

        if scale_factor != source_scale_factor:
            f_data = f_data.astype(np.float32)
            f_data *= scale_factor / source_scale_factor

        # Round before the integer cast so values are not truncated
        if np.issubdtype(dtype, np.integer):
            f_data = np.round(f_data)

        f_data = f_data.astype(dtype)

        try:
            self.data[row_slice, col_slice] = f_data
        except Exception as e:
            msg = (f'Failed to add data to self.data[{row_slice}, '
                   f'{col_slice}] for feature={feature}, '
                   f'file_path={file_path}, time_index={time_index}, '
                   f'meta={meta}. {e}')
            logger.error(msg)
            raise OSError(msg) from e

    def _get_file_attrs(self, file):
        """Get meta data and time index for a single file"""
        if file in self.file_attrs:
            meta = self.file_attrs[file]['meta']
            time_index = self.file_attrs[file]['time_index']
        else:
            with RexOutputs(file, mode='r') as f:
                meta = f.meta
                time_index = f.time_index
        if file not in self.file_attrs:
            self.file_attrs[file] = {'meta': meta, 'time_index': time_index}
        return meta, time_index

    def _get_collection_attrs(
        self, file_paths, sort=True, sort_key=None, max_workers=None
    ):
        """Get important dataset attributes from a file list to be collected.

        Assumes the file list is chunked in time (row chunked).

        Parameters
        ----------
        file_paths : list | str
            Explicit list of str file paths that will be sorted and collected
            or a single string with unix-style /search/patt*ern.h5.
        sort : bool
            flag to sort flist to determine meta data order.
        sort_key : None | fun
            Optional sort key to sort flist by (determines how meta is built
            if out_file does not exist).
        max_workers : int | None
            Number of workers to use in parallel. 1 runs serial,
            None will use all available workers.
        target_final_meta_file : str
            Path to target final meta containing coordinates to keep from the
            full list of coordinates present in the collected meta for the full
            file list.
        threshold : float
            Threshold distance for finding target coordinates within full meta

        Returns
        -------
        time_index : pd.datetimeindex
            Concatenated full size datetime index from the flist that is
            being collected
        meta : pd.DataFrame
            Concatenated full size meta data from the flist that is being
            collected or provided target meta
        """
        if sort:
            file_paths = sorted(file_paths, key=sort_key)

        logger.info(
            'Getting collection attrs for full dataset with '
            f'max_workers={max_workers}.'
        )

        time_index = [None] * len(file_paths)
        meta = [None] * len(file_paths)
        if max_workers == 1:
            for i, fn in enumerate(file_paths):
                meta[i], time_index[i] = self._get_file_attrs(fn)
                logger.debug(f'{i+1} / {len(file_paths)} files finished')
        else:
            futures = {}
            with ThreadPoolExecutor(max_workers=max_workers) as exe:
                for i, fn in enumerate(file_paths):
                    future = exe.submit(self._get_file_attrs, fn)
                    futures[future] = i

                for i, future in enumerate(as_completed(futures)):
                    mem = psutil.virtual_memory()
                    msg = (
                        f'Meta collection futures completed: {i + 1} out '
                        f'of {len(futures)}. Current memory usage is '
                        f'{mem.used / 1e9:.3f} GB out of '
                        f'{mem.total / 1e9:.3f} GB total.'
                    )
                    logger.info(msg)
                    try:
                        idx = futures[future]
                        meta[idx], time_index[idx] = future.result()
                    except Exception as e:
                        msg = (
                            'Falied to get attrs from '
                            f'{file_paths[futures[future]]}'
                        )
                        logger.exception(msg)
                        raise RuntimeError(msg) from e
        time_index = pd.DatetimeIndex(np.concatenate(time_index))
        time_index = time_index.sort_values()
        time_index = time_index.drop_duplicates()
        meta = pd.concat(meta)

        if 'latitude' in meta and 'longitude' in meta:
            meta = meta.drop_duplicates(subset=['latitude', 'longitude'])
        meta = meta.sort_values('gid')

        return time_index, meta

    def get_target_and_masked_meta(
        self, meta, target_final_meta_file=None, threshold=1e-4
    ):
        """Use combined meta for all files and target_final_meta_file to get
        mapping from the full meta to the target meta and the mapping from the
        target meta to the full meta, both of which are masked to remove
        coordinates not present in the target_meta.

        Parameters
        ----------
        meta : pd.DataFrame
            Concatenated full size meta data from the flist that is being
            collected or provided target meta
        target_final_meta_file : str
            Path to target final meta containing coordinates to keep from the
            full list of coordinates present in the collected meta for the full
            file list.
        threshold : float
            Threshold distance for finding target coordinates within full meta

        Returns
        -------
        target_final_meta : pd.DataFrame
            Concatenated full size meta data from the flist that is being
            collected or provided target meta
        masked_meta : pd.DataFrame
            Concatenated full size meta data from the flist that is being
            collected masked against target_final_meta
        """
        if target_final_meta_file is not None and os.path.exists(
            target_final_meta_file
        ):
            target_final_meta = pd.read_csv(target_final_meta_file)
            if 'gid' in target_final_meta.columns:
                target_final_meta = target_final_meta.drop('gid', axis=1)
            mask = self.get_coordinate_indices(
                target_final_meta, meta, threshold=threshold
            )
            masked_meta = meta.iloc[mask]
            logger.info(f'Masked meta coordinates: {len(masked_meta)}')
            mask = self.get_coordinate_indices(
                masked_meta, target_final_meta, threshold=threshold
            )
            target_final_meta = target_final_meta.iloc[mask]
            logger.info(f'Target meta coordinates: {len(target_final_meta)}')
        else:
            target_final_meta = masked_meta = meta

        return target_final_meta, masked_meta

    def get_collection_attrs(
        self,
        file_paths,
        sort=True,
        sort_key=None,
        max_workers=None,
        target_final_meta_file=None,
        threshold=1e-4,
    ):
        """Get the time index, target / masked meta, output shape and global
        attributes for a file list to be collected.

        Assumes the file list is chunked in time (row chunked).

        Parameters
        ----------
        file_paths : list | str
            Explicit list of str file paths that will be sorted and collected
            or a single string with unix-style /search/patt*ern.h5.
        sort : bool
            flag to sort flist to determine meta data order.
        sort_key : None | fun
            Optional sort key to sort flist by (determines how meta is built
            if out_file does not exist).
        max_workers : int | None
            Number of workers to use in parallel. 1 runs serial,
            None will use all available workers.
        target_final_meta_file : str
            Path to target final meta containing coordinates to keep from the
            full list of coordinates present in the collected meta for the full
            file list. If a string is given the file must exist.
        threshold : float
            Threshold distance for finding target coordinates within full meta

        Returns
        -------
        time_index : pd.datetimeindex
            Concatenated full size datetime index from the flist that is
            being collected
        target_final_meta : pd.DataFrame
            Concatenated full size meta data from the flist that is being
            collected or provided target meta
        masked_meta : pd.DataFrame
            Concatenated full size meta data from the flist that is being
            collected masked against target_final_meta
        shape : tuple
            Output (collected) dataset shape, (len(time_index),
            len(target_final_meta))
        global_attrs : dict
            Global attributes from the first file in file_paths as given,
            i.e. before any sorting (it's assumed that all the files in
            file_paths have the same global file attributes).
        """
        logger.info(f'Using target_final_meta_file={target_final_meta_file}')
        if isinstance(target_final_meta_file, str):
            assert os.path.exists(target_final_meta_file), (
                f'Provided target meta ({target_final_meta_file}) does not '
                'exist.'
            )

        time_index, full_meta = self._get_collection_attrs(
            file_paths, sort=sort, sort_key=sort_key, max_workers=max_workers
        )

        target_final_meta, masked_meta = self.get_target_and_masked_meta(
            full_meta, target_final_meta_file, threshold=threshold
        )

        with RexOutputs(file_paths[0], mode='r') as fin:
            global_attrs = fin.global_attrs

        n_times = len(time_index)
        n_sites = len(target_final_meta)
        shape = (n_times, n_sites)

        return time_index, target_final_meta, masked_meta, shape, global_attrs

    def _write_flist_data(
        self,
        out_file,
        feature,
        time_index,
        subset_masked_meta,
        target_masked_meta,
    ):
        """Write the data held in ``self.data`` for a file list to the
        matching rows and columns of the output file for the given feature

        Parameters
        ----------
        out_file : str
            Name of output file
        feature : str
            Name of feature for output chunk
        time_index : pd.DateTimeIndex
            Time index for corresponding file list data
        subset_masked_meta : pd.DataFrame
            Meta for corresponding file list data
        target_masked_meta : pd.DataFrame
            Meta for full output file

        Raises
        ------
        OSError
            If the data cannot be written to the output file.
        """
        # Work out where this file list's data lands in the output arrays
        with RexOutputs(out_file, mode='r') as fin:
            row_sel, col_sel = self.get_slices(
                fin.time_index,
                target_masked_meta,
                time_index,
                subset_masked_meta,
            )

        # NOTE(review): presumably creates the dataset when it is missing -
        # the helper is defined outside this class, confirm.
        self._ensure_dset_in_output(out_file, feature)

        with RexOutputs(out_file, mode='a') as fout:
            try:
                fout[feature, row_sel, col_sel] = self.data
            except Exception as e:
                msg = (
                    f'Problem with writing data to {out_file} with '
                    f't_slice={row_sel}, '
                    f's_slice={col_sel}. {e}'
                )
                logger.error(msg)
                raise OSError(msg) from e

        out_name = os.path.basename(out_file)
        logger.debug(
            'Finished writing "{}" for row {} and col {} to: {}'.format(
                feature, row_sel, col_sel, out_name
            )
        )

    def _collect_flist(
        self,
        feature,
        subset_masked_meta,
        time_index,
        shape,
        file_paths,
        out_file,
        target_masked_meta,
        max_workers=None,
    ):
        """Collect a dataset from a file list into memory and write it to the
        output file, without getting attributes first. This file list can be
        a subset of a full file list to be collected.

        Parameters
        ----------
        feature : str
            Dataset name to collect.
        subset_masked_meta : pd.DataFrame
            Meta data containing the list of coordinates present in both the
            given file paths and the target_final_meta. This can be a subset of
            the coordinates present in the full file list. The coordinates
            contained in this dataframe have the same gids as those present in
            the meta for the full file list. If empty, a warning is issued
            and nothing is collected.
        time_index : pd.datetimeindex
            Concatenated datetime index for the given file paths.
        shape : tuple
            Output (collected) dataset shape
        file_paths : list | str
            File list to be collected. This can be a subset of a full file list
            to be collected.
        out_file : str
            File path of final output file.
        target_masked_meta : pd.DataFrame
            Same as subset_masked_meta but instead for the entire list of files
            to be collected.
        max_workers : int | None
            Number of workers to use in parallel. 1 runs serial,
            None uses all available.
        """
        if len(subset_masked_meta) == 0:
            msg = (
                'No target coordinates found in masked meta. Skipping '
                f'collection for {file_paths}.'
            )
            logger.warning(msg)
            warn(msg)
            return

        attrs, final_dtype = self.get_dset_attrs(feature)
        scale_factor = attrs.get('scale_factor', 1)

        logger.debug(
            'Collecting file list of shape {}: {}'.format(
                shape, file_paths
            )
        )

        # In-memory buffer that every get_data call writes its chunk into
        self.data = np.zeros(shape, dtype=final_dtype)
        mem = psutil.virtual_memory()
        logger.debug(
            'Initializing output dataset "{}" in-memory with '
            'shape {} and dtype {}. Current memory usage is '
            '{:.3f} GB out of {:.3f} GB total.'.format(
                feature,
                shape,
                final_dtype,
                mem.used / 1e9,
                mem.total / 1e9,
            )
        )

        # Arguments shared by every get_data call (after the file name)
        shared_args = (
            feature,
            time_index,
            subset_masked_meta,
            scale_factor,
            final_dtype,
        )

        if max_workers == 1:
            n_files = len(file_paths)
            for i, fname in enumerate(file_paths):
                logger.debug(
                    'Collecting data from file {} out of {}.'.format(
                        i + 1, n_files
                    )
                )
                self.get_data(fname, *shared_args)
        else:
            logger.info(
                'Running parallel collection on {} workers.'.format(
                    max_workers
                )
            )

            with ThreadPoolExecutor(max_workers=max_workers) as exe:
                futures = {
                    exe.submit(self.get_data, fname, *shared_args): fname
                    for fname in file_paths
                }
                for completed, future in enumerate(
                    as_completed(futures), start=1
                ):
                    mem = psutil.virtual_memory()
                    logger.info(
                        'Collection futures completed: '
                        '{} out of {}. '
                        'Current memory usage is '
                        '{:.3f} GB out of {:.3f} GB total.'.format(
                            completed,
                            len(futures),
                            mem.used / 1e9,
                            mem.total / 1e9,
                        )
                    )
                    try:
                        future.result()
                    except Exception as e:
                        msg = 'Failed to collect data from '
                        msg += f'{futures[future]}'
                        logger.exception(msg)
                        raise RuntimeError(msg) from e

        self._write_flist_data(
            out_file,
            feature,
            time_index,
            subset_masked_meta,
            target_masked_meta,
        )

    def group_time_chunks(self, file_paths, n_writes=None):
        """Group files by temporal_chunk_index. Assumes file_paths have a
        suffix format like _{temporal_chunk_index}_{spatial_chunk_index}.h5

        Parameters
        ----------
        file_paths : list
            List of file paths each with a suffix
            _{temporal_chunk_index}_{spatial_chunk_index}.h5
        n_writes : int | None
            Number of writes to use for collection. Only checked here: it
            must not exceed the number of temporal chunks.

        Returns
        -------
        file_chunks : list
            List of lists of file paths grouped by temporal_chunk_index, in
            order of first appearance of each index
        """
        by_time = {}
        for path in file_paths:
            # Second to last underscore-separated token is the temporal index
            by_time.setdefault(path.split('_')[-2], []).append(path)
        file_chunks = list(by_time.values())

        logger.debug(
            f'Split file list into {len(file_chunks)} chunks '
            'according to temporal chunk indices'
        )

        if n_writes is not None:
            assert n_writes <= len(file_chunks), (
                f'n_writes ({n_writes}) must be less than or equal '
                f'to the number of temporal chunks ({len(file_chunks)}).'
            )
        return file_chunks

    def get_flist_chunks(self, file_paths, n_writes=None, join_times=False):
        """Get file list chunks based on n_writes

        Parameters
        ----------
        file_paths : list
            List of file paths to collect
        n_writes : int | None
            Number of writes to use for collection
        join_times : bool
            Option to split full file list into chunks with each chunk having
            the same temporal_chunk_index. The number of writes will then be
            min(number of temporal chunks, n_writes). This ensures that each
            write has all the spatial chunks for a given time index. Assumes
            file_paths have a suffix format
            _{temporal_chunk_index}_{spatial_chunk_index}.h5.  This is required
            if there are multiple writes and chunks have different time
            indices.

        Returns
        -------
        flist_chunks : list
            List of file list chunks. Used to split collection and writing into
            multiple steps.
        """
        if join_times:
            flist_chunks = self.group_time_chunks(
                file_paths, n_writes=n_writes
            )
        else:
            flist_chunks = [[f] for f in file_paths]

        if n_writes is not None:
            flist_chunks = np.array_split(flist_chunks, n_writes)
            flist_chunks = [
                np.concatenate(fp_chunk) for fp_chunk in flist_chunks
            ]
            logger.debug(
                f'Split file list into {len(flist_chunks)} '
                f'chunks according to n_writes={n_writes}'
            )
        return flist_chunks

    @classmethod
    def collect(
        cls,
        file_paths,
        out_file,
        features,
        max_workers=None,
        log_level=None,
        log_file=None,
        write_status=False,
        job_name=None,
        pipeline_step=None,
        join_times=False,
        target_final_meta_file=None,
        n_writes=None,
        overwrite=True,
        threshold=1e-4,
    ):
        """Collect data files from a dir to one output file.

        Filename requirements:
         - Should end with ".h5"

        Parameters
        ----------
        file_paths : list | str
            Explicit list of str file paths that will be sorted and collected
            or a single string with unix-style /search/patt*ern.h5.
        out_file : str
            File path of final output file. May be a bare file name, in which
            case no output directory is created.
        features : list
            List of dsets to collect
        max_workers : int | None
            Number of workers to use in parallel. 1 runs serial,
            None will use all available workers.
        log_level : str | None
            Desired log level, None will not initialize logging.
        log_file : str | None
            Target log file. None logs to stdout.
        write_status : bool
            Flag to write status file once complete if running from pipeline.
        job_name : str
            Job name for status file if running from pipeline.
        pipeline_step : str, optional
            Name of the pipeline step being run. If ``None``, the
            ``pipeline_step`` will be set to ``"collect"``,
            mimicking old reV behavior. By default, ``None``.
        join_times : bool
            Option to split full file list into chunks with each chunk having
            the same temporal_chunk_index. The number of writes will then be
            min(number of temporal chunks, n_writes). This ensures that each
            write has all the spatial chunks for a given time index. Assumes
            file_paths have a suffix format
            _{temporal_chunk_index}_{spatial_chunk_index}.h5.  This is required
            if there are multiple writes and chunks have different time
            indices.
        target_final_meta_file : str
            Path to target final meta containing coordinates to keep from the
            full file list collected meta. This can be but is not necessarily a
            subset of the full list of coordinates for all files in the file
            list. This is used to remove coordinates from the full file list
            which are not present in the target_final_meta. Either this full
            meta or a subset, depending on which coordinates are present in
            the data to be collected, will be the final meta for the collected
            output files.
        n_writes : int | None
            Number of writes to split full file list into. Must be less than
            or equal to the number of temporal chunks if chunks have different
            time indices.
        overwrite : bool
            Whether to overwrite existing output file
        threshold : float
            Threshold distance for finding target coordinates within full meta
        """
        t0 = time.time()

        # Configure logging first so the messages below reach the requested
        # handlers.
        # NOTE(review): this initializes the 'sup3r.preprocessing' logger
        # although this module lives under sup3r.postprocessing - confirm
        # whether that target is intentional.
        if log_level is not None:
            init_logger(
                'sup3r.preprocessing', log_file=log_file, log_level=log_level
            )

        logger.info(
            f'Initializing collection for file_paths={file_paths}, '
            f'with max_workers={max_workers}.'
        )

        # dirname is '' for a bare file name; os.makedirs('') would raise, so
        # only create the directory when there is one to create.
        out_dir = os.path.dirname(out_file)
        if out_dir and not os.path.exists(out_dir):
            os.makedirs(out_dir, exist_ok=True)

        collector = cls(file_paths)
        logger.info(
            'Collecting {} files to {}'.format(len(collector.flist), out_file)
        )
        if overwrite and os.path.exists(out_file):
            logger.info(f'overwrite=True, removing {out_file}.')
            os.remove(out_file)

        # Attributes describing the full collection (all files).
        out = collector.get_collection_attrs(
            collector.flist,
            max_workers=max_workers,
            target_final_meta_file=target_final_meta_file,
            threshold=threshold,
        )
        time_index, target_final_meta, target_masked_meta = out[:3]
        shape, global_attrs = out[3:]

        # The chunking only depends on the file list and the write options,
        # not on the dataset, so compute it once for all features.
        if join_times or n_writes is not None:
            flist_chunks = collector.get_flist_chunks(
                collector.flist, n_writes=n_writes, join_times=join_times
            )
        else:
            flist_chunks = [collector.flist]

        for dset in features:
            logger.debug('Collecting dataset "{}".'.format(dset))

            # The output file is initialized with the full-collection time
            # index and meta before the first dataset is written.
            if not os.path.exists(out_file):
                collector._init_h5(
                    out_file, time_index, target_final_meta, global_attrs
                )

            if len(flist_chunks) == 1:
                collector._collect_flist(
                    dset,
                    target_masked_meta,
                    time_index,
                    shape,
                    flist_chunks[0],
                    out_file,
                    target_masked_meta,
                    max_workers=max_workers,
                )

            else:
                for j, flist in enumerate(flist_chunks):
                    logger.info(
                        'Collecting file list chunk {} out of {} '.format(
                            j + 1, len(flist_chunks)
                        )
                    )
                    # Per-chunk attributes use their own names so they do
                    # not clobber the full-collection values above.
                    (
                        chunk_time_index,
                        _,
                        chunk_masked_meta,
                        chunk_shape,
                        _,
                    ) = collector.get_collection_attrs(
                        flist,
                        max_workers=max_workers,
                        target_final_meta_file=target_final_meta_file,
                        threshold=threshold,
                    )
                    collector._collect_flist(
                        dset,
                        chunk_masked_meta,
                        chunk_time_index,
                        chunk_shape,
                        flist,
                        out_file,
                        target_masked_meta,
                        max_workers=max_workers,
                    )

        if write_status and job_name is not None:
            status = {
                'out_dir': out_dir,
                'fout': out_file,
                'flist': collector.flist,
                'job_status': 'successful',
                'runtime': (time.time() - t0) / 60,
            }
            pipeline_step = pipeline_step or 'collect'
            Status.make_single_job_file(
                out_dir, pipeline_step, job_name, status
            )

        logger.info('Finished file collection.')
