import numpy as np
import pandas as pd
from scipy.stats import lognorm, genpareto, norm


def max_out_of_n_from_dist(dist, out_of_n=1e9):
    manageable_n = 100000
    if out_of_n <= manageable_n:
        return dist.rvs(out_of_n).max()
    else:
        top_percentiles = \
            np.random.rand(manageable_n) * manageable_n / out_of_n
        return min(
            dist.isf(top_percentiles.min()), 
            np.finfo(float).max / 100) # prevent inf


def norm_redraw(mu, sigma):
    draw = int(round(norm(mu, scale=sigma).rvs(1)[0]))
    if draw <= -1:
        return norm_redraw(mu, sigma)
    else:
        return draw

def calculate_bf(
    dist_type, #genpareto or lognorm
    shape_params_dist, #np.geomspace or np.linspace
    sample_size, #1e9 to 1e10+
    greater_by, #6 to 10+
    outlier_interval, # (0.5, 1) to (0.5, 0.9)
    n_historical_outliers, #60 to 300+
    n_max_draws=10000,
):
    
    if dist_type == genpareto:
        shape_limits = [0.01, 2.0]
    elif dist_type == lognorm:
        shape_limits = [0.1, 10.0]
    shape_params_list = shape_params_dist(
        shape_limits[0], shape_limits[1], 200)
        
    shape_params = []
    pdf_greater_by = []
    n_outliers = []
    
    for shape_param in shape_params_list:
        dist = dist_type(shape_param, scale=1, loc=0)
        for i in range(n_max_draws):            
            max_val = max_out_of_n_from_dist(dist, sample_size)
            scaled_dist = dist_type(
                shape_param, scale=1 / max_val, loc=0)
            # approximate binom with norm
            p_outlier = (
                scaled_dist.sf(outlier_interval[0]) 
                - scaled_dist.sf(outlier_interval[1])
            ) / scaled_dist.cdf(1)
            mu = (sample_size - 1) * p_outlier
            sigma = (
                (sample_size - 1) 
                * p_outlier * (1 - p_outlier)) ** 0.5
            n_outlier = norm_redraw(mu, sigma) + 1
            if n_outlier == n_historical_outliers:
                n_outliers.append(n_outlier)
                shape_params.append(shape_param)
                pdf_greater_by.append(scaled_dist.pdf(greater_by))

    result_df = pd.DataFrame({
        "shape_params":shape_params, 
        "pdf_greater_by":pdf_greater_by,
        "n_outliers":n_outliers,
    })
    
    if result_df.shape[0] < 50:
        print("warning: result_df.shape = ", result_df.shape)
    if result_df["shape_params"].max() == shape_params_list.max():
        print("warning: maxed out shape_param")
    
    bf = (0.5 / 12) / result_df["pdf_greater_by"].mean()
    print("Bayes factor: {:.2e}".format(bf))


calculate_bf(
    dist_type=genpareto,
    shape_params_dist=np.linspace,
    sample_size=int(1e9),
    greater_by=6,
    outlier_interval=(0.5, 1),
    n_historical_outliers=60,
)

Bayes factor: 1.06e+11


calculate_bf(
    dist_type=genpareto,
    shape_params_dist=np.linspace,
    sample_size=1e10,
    greater_by=7,
    outlier_interval=(0.5, 1),
    n_historical_outliers=200,
)

Bayes factor: 1.12e+14


calculate_bf(
    dist_type=lognorm,
    shape_params_dist=np.geomspace,
    sample_size=int(1e10),
    greater_by=8,
    outlier_interval=(0.5, 1),
    n_historical_outliers=300,
)

Bayes factor: 8.41e+15


calculate_bf(
    dist_type=lognorm,
    shape_params_dist=np.linspace,
    sample_size=int(1e10),
    greater_by=8,
    outlier_interval=(0.5, 0.9),
    n_historical_outliers=200,
)

Bayes factor: 3.80e+14


calculate_bf(
    dist_type=genpareto,
    shape_params_dist=np.linspace,
    sample_size=int(2e10),
    greater_by=10,
    outlier_interval=(0.5, 0.95),
    n_historical_outliers=250,
)

Bayes factor: 6.12e+15


calculate_bf(
    dist_type=lognorm,
    shape_params_dist=np.geomspace,
    sample_size=int(2e10),
    greater_by=10,
    outlier_interval=(0.5, 1),
    n_historical_outliers=200,
)

Bayes factor: 1.14e+16