moved into old_concentration_test.py

Oscar Plaisant 2024-06-27 17:23:05 +02:00
parent 87ae317c31
commit 0835eff420


@@ -1,425 +0,0 @@
import matplotlib.pyplot as plt
from matplotlib.colors import CSS4_COLORS
import numpy as np
from scipy.stats import norm as Norm, beta as Beta, t as Student
from tprint import tprint
import orderankings as odrk
from querying import find_orderings
from kemeny_young import kendall_tau_dist, rank_aggregation
from tqdm import tqdm
from collections import Counter, defaultdict
import joblib
from functools import partial
import random
# Random number generator for the whole program
RNG = np.random.default_rng(1234)
# toggle verbose logging for the whole program
# VERBOSE = True
VERBOSE = False
################## DATA SETTINGS (parameters, hypothesis...) ###################
# """ comment this line (as it is here) to use the SSB dataset settings; uncomment it to switch to the flight_delay dataset below
# SSB dataset settings # {{{
PARAMETER = "p_color"
SUMMED_ATTRIBUTE = "lo_quantity"
# SUMMED_ATTRIBUTE = "lo_revenue"
# SUMMED_ATTRIBUTE = "lo_extendedprice"
LENGTH = 2
authorized_parameter_values = {
"p_size": tuple(map(int, range(50))),
"p_color": tuple(CSS4_COLORS.keys()),
}
AUTHORIZED_PARAMETER_VALUES = authorized_parameter_values[PARAMETER]
CRITERION = (
##### customer table
# "c_region",
"c_city",
# "c_nation",
##### part table
"p_category",
"p_brand",
# "p_mfgr",
# "p_color",
# "p_type",
# "p_container",
##### supplier table
"s_city",
# "s_nation",
# "s_region",
##### order date
# "D_DATE",
# "D_DATEKEY",
# "D_DATE",
# "D_DAYOFWEEK",
# "D_MONTH",
# "D_YEAR",
# "D_YEARMONTHNUM",
# "D_YEARMONTH",
# "D_DAYNUMINWEEK"
# "D_DAYNUMINMONTH",
# "D_DAYNUMINYEAR",
# "D_MONTHNUMINYEAR",
# "D_WEEKNUMINYEAR",
# "D_SELLINGSEASON",
# "D_LASTDAYINWEEKFL",
# "D_LASTDAYINMONTHFL",
# "D_HOLIDAYFL",
# "D_WEEKDAYFL",
)
# HYPOTHESIS_ORDERING = ("bisque", "aquamarine")
HYPOTHESIS_ORDERING = ("bisque", "blue")
# HYPOTHESIS_ORDERING = [2, 32]
# HYPOTHESIS_ORDERING = [30, 18]
# HYPOTHESIS_ORDERING = [37, 49, 10]
# }}}
""" # flight_delay dataset settings {{{
PARAMETER = "departure_airport"
SUMMED_ATTRIBUTE = "nb_flights"
LENGTH = 3
CRITERION = (
# "airline",
"departure_hour", # simpson's paradox ?
# "day",
# "month",
# "year",
)
GLOBAL_ORDERING = ['ATL', 'ORD', 'DFW', 'DEN', 'LAX', 'IAH', 'LAS',
'SFO', 'PHX', 'MCO', 'SEA', 'CLT', 'MSP', 'LGA',
'DTW', 'EWR', 'BOS', 'BWI', 'SLC', 'JFK']
AUTHORIZED_PARAMETER_VALUES = GLOBAL_ORDERING
# Correct hypothesis for each length (so the loss converges to 0)
CORRECT_ORDERINGS = defaultdict(lambda: GLOBAL_ORDERING)
CORRECT_ORDERINGS[2] = ['ATL', 'DEN']
CORRECT_ORDERINGS[3] = ['ATL', 'DFW', 'ORD']
CORRECT_ORDERINGS[4] = ['ATL', 'DEN', 'DFW', 'ORD']
CORRECT_ORDERINGS[5] = ['ATL', 'ORD', 'DFW', 'DEN', 'LAX']
# now select the right one according to the LENGTH
CORRECT_ORDERING = CORRECT_ORDERINGS[LENGTH][:LENGTH]
# Use the correct ordering
HYPOTHESIS_ORDERING = CORRECT_ORDERING
print(HYPOTHESIS_ORDERING)
# HYPOTHESIS_ORDERING = ['ATL', 'ORD', 'DFW', 'DEN', 'LAX']
# HYPOTHESIS_ORDERING = ['ATL', 'ORD', 'DFW', 'LAX', 'DEN', 'IAH'][:LENGTH]
# HYPOTHESIS_ORDERING = ['ATL', 'ORD', 'DFW', 'DEN', 'LAS', 'LAX', 'IAH'][:LENGTH]
# HYPOTHESIS_ORDERING = ['ORD', 'ATL', 'DEN', 'DFW', 'LAX'] # interesting loss curve
assert len(HYPOTHESIS_ORDERING) == LENGTH
# }}}
# """
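# Note on the dataset toggle above: while the """ on the SSB line stays
# commented out, the SSB settings are executed and the flight_delay block is
# absorbed into the module-level string literal opened just before it;
# uncommenting that """ flips which block is live.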
def orderings_average_loss(orderings: list[list[str]], truth: list[str]) -> float:# {{{
    """This loss is the average of Kendall tau distances between the truth
    and each ordering."""
rankings = odrk.rankings_from_orderings(orderings)
true_ranking = odrk.rankings_from_orderings([truth])[0]
return rankings_average_loss(rankings, true_ranking)# }}}
def rankings_average_loss(rankings: list[list[int]], truth: list[int]) -> float:# {{{
distance = sum(kendall_tau_dist(rkng, truth) for rkng in rankings)
length = len(rankings)
    # dividing by the number of rankings gives the mean Kendall tau distance per ranking
return distance / length
# return distance * 2 / (length * (length - 1))}}}
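# Illustrative sketch (hypothetical helper, never called): the averaged loss
# above can also be computed from scratch by counting, for each ordering, the
# pairs of items ranked in the opposite relative order to the truth (discordant
# pairs), then averaging over the orderings. The real distance comes from
# kemeny_young.kendall_tau_dist; this only spells the idea out.
def _example_average_discordant_pairs(orderings: list[list[str]],
                                      truth: list[str]) -> float:
    """Average number of discordant pairs between each ordering and `truth`."""
    true_rank = {item: rank for rank, item in enumerate(truth)}
    def discordant_pairs(ordering: list[str]) -> int:
        ranks = [true_rank[item] for item in ordering]
        return sum(1
                   for i in range(len(ranks))
                   for j in range(i + 1, len(ranks))
                   if ranks[i] > ranks[j])
    return sum(discordant_pairs(o) for o in orderings) / len(orderings)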
def kmny_dist_loss(orderings: list[list[str]], truth: list[str]) -> int:# {{{
    """Return the Kendall tau distance between the truth and the Kemeny-Young
    aggregation of the orderings."""
_, agg_rank = rank_aggregation(odrk.rankings_from_orderings(orderings))
aggregation = odrk.ordering_from_ranking(agg_rank, truth)
loss = kendall_tau_dist(
odrk.ranking_from_ordering(aggregation),
odrk.ranking_from_ordering(truth))
return loss
# print(aggregation, HYPOTHESIS_ORDERING, kdl_agg_dist)}}}
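# Sketch of what the aggregation step computes (illustrative only): a
# Kemeny-Young consensus is the permutation of the items that minimizes the
# total number of pairwise disagreements with the input orderings. The project
# uses kemeny_young.rank_aggregation; the brute force below only makes the
# definition concrete, assumes every ordering contains the same items, and is
# tractable only for very short orderings.
from itertools import permutations

def _example_kemeny_consensus(orderings: list[list[str]]) -> tuple[str, ...]:
    """Brute-force consensus: the permutation with the fewest pairwise disagreements."""
    items = sorted(set(orderings[0]))
    def total_disagreements(candidate: tuple[str, ...]) -> int:
        cand_pos = {item: i for i, item in enumerate(candidate)}
        total = 0
        for ordering in orderings:
            pos = {item: i for i, item in enumerate(ordering)}
            total += sum(1
                         for i, a in enumerate(items)
                         for b in items[i + 1:]
                         if (cand_pos[a] < cand_pos[b]) != (pos[a] < pos[b]))
        return total
    return min(permutations(items), key=total_disagreements)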
def get_loss_progression(): # {{{
grouped_orderings = find_orderings(parameter=PARAMETER,
summed_attribute=SUMMED_ATTRIBUTE,
criterion=CRITERION,
length=LENGTH)
RNG.shuffle(grouped_orderings)
average_losses = []
kendal_aggregation_losses = []
for nb_considered_orderings in range(1, len(grouped_orderings)+1):
# loss as the average distance from truth to all considered orderings
considered_orderings = grouped_orderings[:nb_considered_orderings]
loss = orderings_average_loss(orderings=considered_orderings,
truth=HYPOTHESIS_ORDERING)
# loss as the distance between truth and the aggregation
kdl_agg_loss = kmny_dist_loss(orderings=considered_orderings,
truth=HYPOTHESIS_ORDERING)
kendal_aggregation_losses.append(kdl_agg_loss)
if VERBOSE:
print(f"using {nb_considered_orderings} orderings")
tprint(considered_orderings)
print("truth :", HYPOTHESIS_ORDERING)
print("loss =", loss)
average_losses.append(loss)
return average_losses, kendal_aggregation_losses
# }}}
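# Note on the loop above: once the distance of each individual ordering to the
# truth is known, the "average loss over the first k orderings" curve is just a
# cumulative mean, which numpy produces directly. Illustrative sketch with
# hypothetical per-ordering distances; the real loop recomputes the average so
# that it can also track the Kemeny aggregation loss at each step.
def _example_cumulative_mean(per_ordering_distances: np.ndarray) -> np.ndarray:
    """Running average of the distances over growing prefixes of orderings."""
    counts = np.arange(1, len(per_ordering_distances) + 1)
    return np.cumsum(per_ordering_distances) / counts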
def plot_loss_progression(): # {{{
"""Plot the progression of losses when using more and more of the values
(see get_loss_progression)."""
N = 20
avg_loss_progression, kdl_agg_loss_progression = get_loss_progression()
avg_loss_progression = np.array(avg_loss_progression)
kdl_agg_loss_progression = np.array(kdl_agg_loss_progression)
for _ in tqdm(range(N-1), leave=False):
avg_lp, kmny_lp = get_loss_progression()
avg_loss_progression += avg_lp
kdl_agg_loss_progression += kmny_lp
# print(progression)
if VERBOSE:
print(avg_loss_progression)
print(kdl_agg_loss_progression)
plt.plot(avg_loss_progression, color="orange")
plt.plot(kdl_agg_loss_progression, color="green")
# }}}
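# plot_loss_progression plots the *sum* of the N progressions. If an averaged
# curve is wanted instead, stacking the runs and averaging along axis 0 is a
# compact alternative (hypothetical sketch, not wired into the plotting code;
# it assumes every run sees the same number of orderings).
def _example_mean_progression(number_of_runs: int = 20) -> np.ndarray:
    """Average the average-loss progression over several shuffled runs."""
    runs = [get_loss_progression()[0] for _ in range(number_of_runs)]
    return np.stack(runs).mean(axis=0)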
def get_mode_loss_progression(all_orderings: list[list[str]],
                              number_of_steps: int,
                              orders_added_each_step: int = 1) -> np.ndarray:
    """Track, step by step, whether the modal (most frequent) ordering of the
    sample matches HYPOTHESIS_ORDERING, as a 0/1 loss history."""
# all_rankings = odrk.rankings_from_orderings(all_orderings)
# considered_orderings = list(RNG.choice(all_orderings, size=orders_added_each_step))
considered_orderings = list(random.choices(all_orderings, k=orders_added_each_step))
# count occurrences of each ordering
orderings_count = Counter(map(tuple, considered_orderings))
# loss progression when adding more and more orderings
loss_history = np.zeros(number_of_steps)
# # random permutation of the orderings
# permuted_orderings = np.random.permutation(all_orderings)
for idx in range(number_of_steps):
# new_orders = RNG.choice(all_orderings, size=orders_added_each_step)
new_orders = random.choices(all_orderings, k=orders_added_each_step)
# new_orders = permuted_orderings[orders_added_each_step*idx:orders_added_each_step*(idx+1)]
# considered_orderings.extend(new_orders)
# update the counter of orderings occurrences
orderings_count.update(Counter(map(tuple, new_orders)))
# the most common (modal) ordering
modal_ordering = orderings_count.most_common()[0][0]
modal_ordering = np.array(modal_ordering)
# if VERBOSE: print(modal_ordering)
        # the loss is 0 if the modal ordering equals the hypothesis, 1 otherwise
loss = int(not np.array_equal(modal_ordering, HYPOTHESIS_ORDERING))
# loss = int((modal_ordering == HYPOTHESIS_ORDERING).all())
# loss = int(all(map(lambda x: x[0]==x[1],
# zip(modal_ordering, HYPOTHESIS_ORDERING))))
# add loss to the list of losses
loss_history[idx] = loss
if VERBOSE:
# print(loss_history, HYPOTHESIS_ORDERING)
print(orderings_count.most_common(1)[0])
return np.repeat(loss_history, orders_added_each_step)
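# The heart of the estimator above is "take the most frequent ordering seen so
# far and compare it with the hypothesis". A minimal sketch on made-up data
# (values purely illustrative):
def _example_modal_ordering() -> tuple[tuple[str, ...], int]:
    """Return the most common ordering of a toy sample together with its count."""
    sample = [("bisque", "blue"), ("blue", "bisque"),
              ("bisque", "blue"), ("bisque", "blue")]
    counts = Counter(sample)
    return counts.most_common(1)[0]  # (("bisque", "blue"), 3)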
################################################################################
def plot_modal_losses():
###################
# sampling settings
N = 100 # number of repetitions of the experiment
max_number_of_orders = 7500 # max sample size
    GRANULARITY = 12 # granularity of the sampling (orderings added per step)
number_of_steps = max_number_of_orders // GRANULARITY
all_orderings = find_orderings(
parameter=PARAMETER,
summed_attribute=SUMMED_ATTRIBUTE,
criterion=CRITERION,
length=LENGTH,
authorized_parameter_values=AUTHORIZED_PARAMETER_VALUES)
    print(f"there are {all_orderings.size} orderings in total:")
tprint(all_orderings, limit=10)
# make get_mode_loss_progression parallelizable
gmlp = joblib.delayed(get_mode_loss_progression)
####
# Aggregate multiple simulations
# don't use the tqdm progress bar if there are some logs
range_N = range(N) if VERBOSE else tqdm(range(N))
# for my 8-core computer, n_jobs=7 is empirically the best value
loss_history = joblib.Parallel(n_jobs=7)(
gmlp(all_orderings,
number_of_steps,
orders_added_each_step=GRANULARITY)
for _ in range_N
)
loss_history = np.array(loss_history)
# the sum of losses for each number of steps
losses = np.sum(loss_history, axis=0)
if VERBOSE: print("losses :", losses, sep="\n")
#####
# average
# since losses is the sum of losses, losses/N is the average
mean = losses / N
plt.plot(mean, color="green", label="loss average")
#####
# standard deviation
# variance is (average of squares) - (square of the average)
# since we only have 1 or 0, average of squares is just the average
# so the variance is average - average**2
# stddev is the square root of variance
stddev = np.sqrt(mean - mean**2)
plt.plot(stddev, color="grey", label="loss standard deviation")
############################################################################
# CONFIDENCE INTERVALS
X = np.arange(mean.size) # the x axis
######
## confidence interval
## assuming the experimental variance is the correct one
#confidence = 0.95
#alpha = 1 - confidence
#eta = Norm.ppf(1 - alpha/2, loc=0, scale=1)
#epsilon = eta * stddev / np.sqrt(N)
#plt.fill_between(X, mean - epsilon, mean + epsilon,
# color="blue", alpha=0.25,
# label=f"{100*confidence}% confidence interval")
#####
# confidence interval
    # assuming the per-step mean loss is approximately normally distributed (Student-t interval)
confidence = 0.999999
delta = 1 - confidence
    # corrected (unbiased) sample variance: for 0/1 losses,
    # S**2 = (1/(N-1)) * sum((x_i - mean)**2) = N/(N-1) * (mean - mean**2)
    S = np.sqrt((N / (N - 1)) * (mean - mean**2))
    eta = Student(df=N-1).ppf(1 - delta/2)
    epsilon = eta * S / np.sqrt(N)
plt.fill_between(X, mean - epsilon, mean + epsilon,
color="green", alpha=0.2,
label=f"{100*confidence}% confidence interval")
# confidence = 0.95
# delta = 1 - confidence
# eta = Student(df=X-1).ppf(1 - delta/2)
# epsilon = eta * stddev / np.sqrt(X)
# plt.fill_between(X, mean - epsilon, mean + epsilon,
# color="green", alpha=0.5,
# label=f"{100*confidence}% confidence interval")
######
## beta distribution
## confidence = 0.95
#delta = 1 - confidence
#alpha = np.cumsum(1 - loss_history, axis=1).mean(axis=0)
#beta = np.cumsum(loss_history, axis=1).mean(axis=0)
#epsilon = Beta.ppf(1 - delta/2, alpha, beta)
#plt.fill_between(X, mean - epsilon, mean + epsilon,
# color="orange", alpha=0.30,
# label=f"{100*confidence} β confidence interval")
######
## fluctuation interval
#confidence = 0.1
#alpha = 1-confidence
#k = Norm.ppf(alpha/2, loc=0, scale=1)
#fluctuation = k * stddev
#plt.fill_between(X, mean - fluctuation, mean + fluctuation,
# color="orange", alpha=0.25,
# label=f"{100*confidence}% fluctuation interval")
######
## hoeffding
#t = 0.9999999
#plt.plot(X, 2 * np.exp(-2 * t ** 2 / X),
# color="red")
######
## y = 1/2
#plt.plot([0, mean.size], [0.5, 0.5],
# color="orange", alpha=0.25)
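# The confidence band above relies on two facts spelled out in the comments:
# for 0/1 losses the (biased) variance is mean - mean**2, and the Student-t
# half-width at level 1-delta is t_{1-delta/2, N-1} * s / sqrt(N). A
# self-contained sanity check of those formulas on a toy 0/1 sample
# (hypothetical data, not tied to the experiment):
def _example_student_half_width(confidence: float = 0.95) -> float:
    """Half-width of a Student-t confidence interval for a toy 0/1 sample."""
    sample = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 1], dtype=float)
    n = sample.size
    mean = sample.mean()
    # for 0/1 data, the biased variance mean - mean**2 equals sample.var()
    assert np.isclose(sample.var(), mean - mean**2)
    corrected_std = np.sqrt(n / (n - 1) * (mean - mean**2))
    delta = 1 - confidence
    eta = Student(df=n - 1).ppf(1 - delta / 2)
    return eta * corrected_std / np.sqrt(n)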
if __name__ == '__main__':
rankings = np.array([[1, 3, 2, 4],
[3, 4, 2, 1],
[1, 2, 3, 4],
[1, 3, 2, 4],
[2, 3, 1, 4],
[1, 3, 2, 1],
[2, 3, 1, 4],
[2, 3, 1, 4]])
# all_orderings = find_orderings(parameter=PARAMETER,
# summed_attribute=SUMMED_ATTRIBUTE,
# criterion=CRITERION,
# length=LENGTH)
# # print(all_orderings)
# print(f"There are {len(all_orderings)} orderings in `all_orderings`")
# for _ in range(20):
# dep = time()
# plot_modal_losses()
# print(round(time()-dep, 4))
plt.style.use('dark_background')
# HYPOTHESIS_ORDERING = ("bisque", "aquamarine")
# plot_modal_losses()
HYPOTHESIS_ORDERING = ("bisque", "blue")
plot_modal_losses()
plt.legend()
ax = plt.gca()
# ax.set_ylim([0, 1])
# plt.ion()
plt.show()