Source code for pysgems.dis.sgdis

#  Copyright (c) 2020. Robin Thibaut, Ghent University

import os
import shutil
import time
from os.path import join as jp

import numpy as np
import pandas as pd
from loguru import logger

from pysgems.base.packbase import Package


[docs]def blocks_from_rc(
    rows: np.array,
    columns: np.array,
    layers: np.array,
    xo: float = 0,
    yo: float = 0,
    zo: float = 0,
):
    """
    Yields blocks defining grid cells

    :param rows: array of x-widths along a row
    :param columns: array of y-widths along a column
    :param layers: array of z-widths along a column
    :param xo: x origin
    :param yo: y origin
    :param zo: z origin
    :return: generator of (cell node number, block vertices coordinates, block center)
    """
    nrow = len(rows)
    ncol = len(columns)
    nlay = len(layers)
    delr = rows
    delc = columns
    dell = layers
    r_sum = np.cumsum(delr) + yo
    c_sum = np.cumsum(delc) + xo
    l_sum = np.cumsum(dell) + zo

    def get_node(r: int, c: int, h: int) -> int:
        """
        Get node index to fix hard data

        :param r: row number
        :param c: column number
        :param h: layer number
        :return: node number
        """
        nrc = nrow * ncol
        return int((h * nrc) + (r * ncol) + c)

    for k in range(nlay):
        for i in range(nrow):
            for j in range(ncol):
                b = [
                    [c_sum[j] - delc[j], r_sum[i] - delr[i], l_sum[k] - dell[k]],
                    [c_sum[j], r_sum[i] - delr[i], l_sum[k] - dell[k]],
                    [c_sum[j] - delc[j], r_sum[i], l_sum[k] - dell[k]],
                    [c_sum[j], r_sum[i], l_sum[k] - dell[k]],
                    [c_sum[j] - delc[j], r_sum[i] - delr[i], l_sum[k]],
                    [c_sum[j], r_sum[i] - delr[i], l_sum[k]],
                    [c_sum[j] - delc[j], r_sum[i], l_sum[k]],
                    [c_sum[j], r_sum[i], l_sum[k]],
                ]
                yield get_node(i, j, k), np.array(b), np.mean(b, axis=0)


[docs]class Discretize(Package):
[docs]    def __init__(
        self,
        project,
        dx: float = 1,
        dy: float = 1,
        dz: float = 1,
        xo: float = None,
        yo: float = None,
        zo: float = None,
        x_lim: float = None,
        y_lim: float = None,
        z_lim: float = None,
    ):
        """
        Constructs the grid geometry. The user can not control directly the number of rows and columns
        but instead inputs the cell size in x and y dimensions.
        xo, yo, x_lim, y_lim, defining the bounding box of the grid, are None by default, and are computed
        based on the data points distribution.

        :param project: sgems project
        :param dx: cell size in x dimension
        :param dy: cell size in y dimension
        :param dz: cell size in z dimension
        :param xo: x origin
        :param yo: y origin
        :param zo: z origin
        :param x_lim: x limit
        :param y_lim: y limit
        :param z_lim: z limit
        """

        Package.__init__(self, project)

        self.cell_file = None

        self.dx = dx
        self.dy = dy
        self.dz = dz
        if self.parent.point_set is not None:
            if self.parent.point_set.dimension == 2:
                self.dz = 0

        # Grid origins
        # If a PointSet is already loaded into the project, automatically defines the limits
        # (if no limits are given)
        if xo is None:
            if self.parent.point_set is not None:
                xs = self.parent.point_set.dataframe["x"]
                xo = np.min(xs) - 4 * self.dx
            else:
                xo = 0

        if yo is None:
            if self.parent.point_set is not None:
                ys = self.parent.point_set.dataframe["y"]
                yo = np.min(ys) - 4 * self.dy
            else:
                yo = 0

        if zo is None:
            if self.parent.point_set is not None:
                zs = self.parent.point_set.dataframe["z"]
                zo = np.min(zs) - 4 * self.dz
            else:
                zo = 0

        # Grid limits
        if x_lim is None:
            if self.parent.point_set is not None:
                xs = self.parent.point_set.dataframe["x"]
                x_lim = np.max(xs) + 4 * self.dx
            else:
                x_lim = 1

        if y_lim is None:
            if self.parent.point_set is not None:
                ys = self.parent.point_set.dataframe["y"]
                y_lim = np.max(ys) + 4 * self.dy
            else:
                y_lim = 1

        if z_lim is None:
            if self.parent.point_set is not None:
                zs = self.parent.point_set.dataframe["z"]
                z_lim = np.max(zs) + 4 * self.dz
            else:
                z_lim = 1

        # Cell dimensions
        if self.dy > 0:
            nrow = abs(int((y_lim - yo) // self.dy))  # Number of rows
        else:
            nrow = 1
        if self.dx > 0:
            ncol = abs(int((x_lim - xo) // self.dx))  # Number of columns
        else:
            ncol = 1
        if self.dz > 0:
            nlay = abs(int((z_lim - zo) // self.dz))  # Number of layers
        else:
            nlay = 1

        # Size of each cell along y-dimension - rows
        along_r = np.ones(ncol) * self.dx * np.sign(x_lim)
        # Size of each cell along x-dimension - columns
        along_c = np.ones(nrow) * self.dy * np.sign(y_lim)
        # Size of each cell along x-dimension - columns
        along_l = np.ones(nlay) * self.dz * np.sign(z_lim)

        self.xo = xo
        self.yo = yo
        self.zo = zo

        self.x_lim = x_lim
        self.y_lim = y_lim
        self.z_lim = z_lim

        self.nrow = nrow
        self.ncol = ncol
        self.nlay = nlay

        self.along_r = along_r
        self.along_c = along_c
        self.along_l = along_l

        setattr(self.parent, "dis", self)

[docs]    def my_cell(self, xyz: np.array):
        """
        Given a point coordinate xyz [x, y, z], computes its cell number by computing the euclidean distance of each cell
        center.

        :param xyz:  x, y, z coordinate of data point
        :return: cell number
        """
        start = time.time()
        rn = np.array(xyz)
        # first check if point is within the grid

        # crit = 0
        # if np.all(rn >= np.array([self.xo, self.yo, self.zo])) and np.all(
        #     rn <= np.array([self.x_lim, self.y_lim, self.z_lim])
        # ):
        #     crit = 1

        crit = 1
        if crit:  # if point inside
            if self.parent.point_set.dimension == 3:  # if 3D
                # minimum distance under which a point is in a cell
                dmin = np.min([self.dx, self.dy, self.dz]) / 2
            else:  # if 2D
                # minimum distance under which a point is in a cell
                dmin = np.min([self.dx, self.dy]) / 2

            blocks = blocks_from_rc(
                self.along_c, self.along_r, self.along_l, self.xo, self.yo, self.zo
            )  # cell generator

            # mapping data points to cells:
            # slow but memory-effective method
            vmin = np.inf
            cell = None
            for b in blocks:
                c = b[2]
                dc = np.linalg.norm(rn - c)  # Euclidean distance
                if dc <= dmin:  # If point is inside cell
                    logger.info("found 1 cell id in {} s".format(time.time() - start))
                    return b[0]
                if dc < vmin:
                    vmin = dc
                    cell = b[0]
            logger.info("found 1 cell id in {} s".format(time.time() - start))
            return cell
        else:
            return self.parent.nodata

[docs]    def compute_cells(self, xyz: np.array):
        """
        Determines cell location for each data point.
        It is necessary to know the cell number to assign the hard data property to the sgems grid.

        :param xyz: Data points x, y, z coordinates
        """

        nodes = np.array([self.my_cell(c) for c in xyz])

        # Save to nodes to avoid recomputing each time
        np.save(self.cell_file, nodes)

[docs]    def write_hard_data(
        self,
        subdata: pd.DataFrame,
        dis_file: str = None,
        cell_file: str = None,
        output_dir: str = None,
    ):
        """
        Removes no-data rows from data frame and compute the mean of data points sharing the same cell.
        Export the list of shape (n features, m cells, 2) containing the node of each point data with the corresponding
        value, for each feature.

        :param dis_file: path to the dis file
        :param cell_file: path to the cell file
        :param subdata: Pandas dataframe whose columns are the values of features to save as hard data.
        :param output_dir: Directory where hard data lists will be saved
        :return: Filtered list of each attribute
        """

        if cell_file is not None:
            self.cell_file = cell_file  # path to the cell file

        if self.cell_file is None:
            self.cell_file = jp(
                self.parent.res_dir, "cells.npy"
            )  # path to the cell file

        if output_dir is None:
            output_dir = self.parent.res_dir  # path to the parent directory

        if dis_file is None:
            dis_file = jp(
                os.path.dirname(self.cell_file), "grid.dis"
            )  # path to the dis file

        xyz = subdata[["x", "y", "z"]].to_numpy()  # coordinates of data points

        npar = np.array(
            [
                self.dx,
                self.dy,
                self.dz,
                self.xo,
                self.yo,
                self.zo,
                self.x_lim,
                self.y_lim,
                self.z_lim,
                self.nrow,
                self.ncol,
                self.nlay,
            ]
        )  # grid parameters

        if os.path.isfile(dis_file):  # Check previous grid info
            pdis = np.loadtxt(dis_file)  # Load previous grid info
            # If different, recompute data points cell by deleting previous cell file
            if not np.array_equal(pdis, npar):
                logger.info("New grid found")
                try:
                    os.remove(self.cell_file)
                except FileNotFoundError:
                    pass
                finally:
                    np.savetxt(dis_file, npar)
                    self.compute_cells(xyz)  # Compute cell number for each data point
            else:
                logger.info("Using previous grid")
                if not os.path.exists(self.cell_file):
                    self.compute_cells(xyz)  # Compute cell number for each data point
        else:
            np.savetxt(dis_file, npar)  # Save grid info
            self.compute_cells(xyz)  # Compute cell number for each data point

        data_nodes = np.load(self.cell_file)  # Load cell number for each data point
        unique_nodes = list(set(data_nodes))  # List of unique cell numbers

        h = subdata.columns.values[-1]  # Name of the attribute to save as hard data
        hd = []
        # fixed nodes = [[node i, value i]....]
        fixed_nodes = np.array(
            [[data_nodes[dn], subdata[h][dn]] for dn in range(len(data_nodes))]
        )  # List of cell number and corresponding value
        # Deletes points where val == nodata
        hard_data = np.delete(
            fixed_nodes, np.where(fixed_nodes == self.parent.nodata)[0], axis=0
        )  # Delete no-data points
        # If data points share the same cell, compute their mean and assign the value to the cell
        for n in unique_nodes:
            where = np.where(hard_data[:, 0] == n)[0]
            if len(where) > 1:  # If more than 1 point per cell
                mean = np.mean(hard_data[where, 1])
                hard_data[where, 1] = mean

            fn = hard_data.tolist()
            # For each, feature X, saves a file X.hard
            cell_values_name = jp(os.path.dirname(self.cell_file), f"{h}.hard")
            with open(cell_values_name, "w") as nd:
                nd.write(repr(fn))  # Save list of cell number and corresponding value
            try:
                shutil.copyfile(
                    cell_values_name,
                    cell_values_name.replace(
                        os.path.dirname(cell_values_name), output_dir
                    ),
                )
            except shutil.SameFileError:
                pass

        setattr(self.parent, "hard_data", hd.append(h))