Source code for idf_analysis.sww_utils

__author__ = "David Camhy, Markus Pichler"
__credits__ = ["David Camhy", "Markus Pichler"]
__license__ = "MIT"
__maintainer__ = "Markus Pichler"

import numpy as np
import pandas as pd
from pandas.tseries.frequencies import to_offset

from .definitions import COL



[docs]
class IdfError(Exception):
    """Generic exception type for errors raised by this package."""



########################################################################################################################

[docs]
def guess_freq(date_time_index, default=pd.Timedelta(minutes=1)):
    """
    Guess the frequency of a date-time index.

    The following sources are tried in order:

    1. the ``freq`` attribute of the index,
    2. ``pandas.infer_freq`` (only for indices with more than 3 entries),
    3. the most frequent non-zero difference between consecutive timestamps,
    4. ``default``.

    Args:
        date_time_index (pandas.DatetimeIndex): index of a time-series
        default (pandas.Timedelta): fallback used when the index has 3 or fewer entries
            or when no non-zero time step can be found

    Returns:
        pandas.DateOffset: frequency of the date-time-index
    """
    freq = date_time_index.freq
    if freq is not None:
        return to_offset(freq)

    # Too few timestamps for a meaningful guess.
    if len(date_time_index) <= 3:
        return to_offset(default)

    freq = pd.infer_freq(date_time_index)  # e.g. 'T'
    if freq is not None:
        return to_offset(freq)

    # Irregular index: fall back to the most common step between neighbours.
    # The leading NaT produced by diff() is back-filled with the first real step.
    delta_series = date_time_index.to_series().diff(periods=1).bfill()
    counts = delta_series.value_counts()

    # drop() returns a new object, so the result must be re-assigned. Zero steps
    # come from duplicated timestamps and are not a valid frequency.
    counts = counts.drop(pd.Timedelta(minutes=0), errors='ignore')

    if counts.empty:
        return to_offset(default)

    # value_counts() sorts by descending count, the first label is the most common step.
    return to_offset(counts.index[0])



########################################################################################################################
def year_delta(years):
    """
    Convert a number of years into a time span, using 365.25 days per year.

    Args:
        years (int | float): number of years

    Returns:
        pandas.Timedelta: time span covering the given number of years
    """
    n_days = 365.25 * years
    return pd.Timedelta(days=n_days)


########################################################################################################################

[docs]
def rain_events(series, ignore_rain_below=0.01, min_gap=pd.Timedelta(hours=4)):
    """
    Split a rain series into separate events and return their start and end times.

    Only time steps with a value of at least ``ignore_rain_below`` are considered.
    Two consecutive such time steps belong to different events when they are more
    than ``min_gap`` apart.

    Args:
        series (pandas.Series): rain series
        ignore_rain_below (float): values below this threshold are not treated as rain
        min_gap (pandas.Timedelta): dry period that separates two events (default: 4 hours)

    Returns:
        pandas.DataFrame: table of the rain events with the columns COL.START and COL.END;
            an empty table without any columns if no value reaches the threshold
    """
    # best OKOSTRA adjustment with 0.0
    # by ignoring 0.1 mm the results are getting bigger

    # keep only the timestamps at which it is raining
    is_rain = series >= ignore_rain_below
    wet_times = series.loc[is_rain].index.to_series()

    if wet_times.empty:
        return pd.DataFrame()

    # an event starts where the previous wet time step is further away than min_gap ...
    is_start = wet_times.diff() > min_gap
    is_start.iloc[0] = True  # the first wet time step always opens an event

    # ... and ends where the next wet time step is further away than min_gap
    is_end = wet_times.diff(periods=-1) < -min_gap
    is_end.iloc[-1] = True  # the last wet time step always closes an event

    return pd.DataFrame({
        COL.START: wet_times[is_start].to_list(),
        COL.END: wet_times[is_end].to_list(),
    })




[docs]
def event_number_to_series(events, index):
    """
    Make a time-series in which every time step is labelled with its event number.

    For each row of ``events`` the index label of that row is written to all entries
    of ``index`` between COL.START and COL.END (both inclusive). Time steps outside
    of any event stay NaN.

    Args:
        events (pandas.DataFrame): table of events with the columns COL.START and COL.END
        index (pandas.DatetimeIndex): index of the resulting series

    Returns:
        pandas.Series: event number per time step (float, NaN outside of events)
    """
    # An explicit dtype is required: without it pandas 1.x emits a DeprecationWarning
    # and pandas 2.x creates an object-dtype series.
    ts = pd.Series(index=index, dtype=float)

    # Iterate the columns directly instead of building a dict of dicts first.
    for event_no, start, end in zip(events.index, events[COL.START], events[COL.END]):
        ts[start:end] = event_no

    return ts



########################################################################################################################

[docs]
def agg_events(events, series, agg='sum'):
    """
    Aggregate the time-series values of every event.

    For more than 3500 events the series is grouped by event number in one pass,
    otherwise every event is sliced out of the series and aggregated on its own.

    Args:
        events (pandas.DataFrame): table of events with the columns COL.START and COL.END
        series (pandas.Series): time-series data
        agg (str | function): aggregation of time-series

    Returns:
        numpy.ndarray: result of function of every event
    """
    if events.empty:
        return np.array([])

    if events.index.size > 3500:
        # NOTE(review): groups only exist for events that cover at least one time step
        # of `series` - confirm that callers never pass events without data.
        event_labels = event_number_to_series(events, series.index)
        return series.groupby(event_labels).agg(agg).values

    def _aggregate_event(event):
        # label based slice, both bounds are included
        return series[event[COL.START]:event[COL.END]].agg(agg)

    return events.apply(_aggregate_event, axis=1).values



########################################################################################################################

[docs]
def event_duration(events):
    """
    Compute the time span between the end and the start of every event.

    Args:
        events (pandas.DataFrame): table of events with COL.START and COL.END times

    Returns:
        pandas.Series: duration of each event (COL.END minus COL.START)
    """
    event_end = events[COL.END]
    event_start = events[COL.START]
    return event_end - event_start



########################################################################################################################

[docs]
def rain_bar_plot(rain, ax=None, color='#1E88E5', reverse=False, step='post', joinstyle='miter', capstyle='butt'):
    """
    Make a standard precipitation/rain plot.

    The series is drawn as a filled step function starting at zero.

    Args:
        rain (pandas.Series): rain series with a date-time index
        ax (matplotlib.axes.Axes): axes to draw into; passed on to ``rain.plot``
        color (str): color of the filled area
        reverse (bool): invert the y-axis so that the bars hang from the top
        step (str): step style, one of 'mid', 'post' or 'pre'
        joinstyle (str): join style passed to the line and the filled area
        capstyle (str): cap style passed to the line and the filled area

    Returns:
        matplotlib.axes.Axes: rain plot
    """
    if rain.size == 1:
        # A single value has no width in a step plot: pad it with one (NaN) time step
        # before and after. guess_freq() returns the frequency of the index if it is
        # set and its default otherwise, so an index without `freq` no longer fails.
        freq = guess_freq(rain.index)
        freq_step = pd.Timedelta(freq)
        rain = rain.reindex(pd.date_range(rain.index[0] - freq_step, periods=3, freq=freq))

    # The line itself is invisible (lw=0); it only sets up the axes for the filled area.
    ax = rain.plot(ax=ax, drawstyle=f'steps-{step}', color=color, solid_capstyle=capstyle, solid_joinstyle=joinstyle,
                   lw=0)
    ax.fill_between(rain.index, rain.values, 0, step=step, zorder=1000, color=color, capstyle=capstyle,
                    joinstyle=joinstyle)

    # The bars always start at zero; with `reverse` the axis is flipped afterwards.
    ax.set_ylim(bottom=0)
    if reverse:
        ax.invert_yaxis()

    return ax



########################################################################################################################

[docs]
def resample_rain_series(series):
    """
    Resample a rain time-series to a time step that suits the length of the series.

    The target time step (in minutes) is looked up from the time span between the first
    and the last index entry. If the time step of the series is already coarser than the
    target, the series is returned unchanged. Otherwise it is resampled to the target
    time step by summing the values of each interval.

    Args:
        series (pandas.Series): A time-series of rain data, indexed by a pandas.DatetimeIndex.

    Returns:
        tuple[pandas.Series, int]: A tuple containing:
            - The resampled time-series, or the original series if it was not resampled.
            - The time step of the returned series in minutes (e.g., 5 for 5-minute intervals).

    Notes:
        - The target time step is chosen as follows:
            - Duration < 5 hours: 1 minute.
            - Duration < 12 hours: 2 minutes.
            - Duration < 1 day: 5 minutes.
            - Duration < 2 days: 10 minutes.
            - Duration < 3 days: 15 minutes.
            - Any longer duration: 20 minutes.
        - If the time step of the series is coarser than the target time step, the series
          is returned unchanged together with its own time step truncated to whole minutes.
        - Resampling uses the sum of each interval to preserve the total rain amount.
    """
    one_minute = pd.Timedelta(minutes=1)

    # (upper limit of the series duration, target time step in minutes)
    duration_steps = (
        (pd.Timedelta(hours=5), 1),
        (pd.Timedelta(hours=12), 2),
        (pd.Timedelta(days=1), 5),
        (pd.Timedelta(days=2), 10),
        (pd.Timedelta(days=3), 15),
        (pd.Timedelta(days=4), 20),
    )

    total_span = series.index[-1] - series.index[0]
    current_freq = guess_freq(series.index)

    # first limit that exceeds the duration wins; the last step is used for everything longer
    target_minutes = next(
        (step for limit, step in duration_steps if total_span < limit),
        duration_steps[-1][1],
    )

    if pd.Timedelta(current_freq) > pd.Timedelta(minutes=target_minutes):
        # the data is already coarser than the target and cannot be refined
        return series, int(current_freq / one_minute)

    return series.resample(f'{target_minutes}min').sum(), target_minutes