move concentration_test.py to old_concentration_test.py.

gitignore the `__pycache__`.
2024-06-27 17:14:56 +02:00 · 2024-06-27 17:14:56 +02:00 · 292510f409
commit 292510f409
parent fc6a95334d
11 changed files with 342 additions and 0 deletions
--- a/old_concentration_test.py
+++ b/old_concentration_test.py
@ -0,0 +1,342 @@
+import matplotlib.pyplot as plt
+import numpy as np
+from scipy.stats import norm as Norm, beta as Beta, t as Student
+from tprint import tprint
+import orderankings as odrk
+from querying import find_orderings
+from kemeny_young import kendall_tau_dist, rank_aggregation
+from tqdm import tqdm
+from collections import Counter, defaultdict
+import joblib
+from functools import partial
+import random
+import yaml
+
+# Random number generator for the whole program
+# RNG = np.random.default_rng(1234)
+
+
+
+######################## YAML CONFIG (src/config.yaml) #########################
+with open('src/config.yaml') as config_file:
+    cfg = yaml.load(config_file, Loader=yaml.Loader)
+
+DATABASE_NAME = cfg["database_name"]
+
+
+VERBOSE = cfg["verbose"]["concentration_test"]
+
+
+################## DATA SETTINGS (parameters, hypothesis...) ###################
+# loaded from src/config.yaml
+
+PARAMETER = tuple(cfg[DATABASE_NAME]["parameter"])
+SUMMED_ATTRIBUTE = tuple(cfg[DATABASE_NAME]["summmed_attribute"])
+# SUMMED_ATTRIBUTE = "lo_revenue"
+# SUMMED_ATTRIBUTE = "lo_extendedprice"
+LENGTH = cfg[DATABASE_NAME]["orders_length"]
+
+AUTHORIZED_PARAMETER_VALUES = tuple(cfg[DATABASE_NAME]["authorized_parameter_values"])
+
+CRITERION = tuple(cfg[DATABASE_NAME]["criterion"])
+
+HYPOTHESIS_ORDERING = tuple(cfg[DATABASE_NAME]["hypothesis_ordering"])
+
+assert len(HYPOTHESIS_ORDERING) == LENGTH
+
+################################ LOSS FUNCTIONS ################################
+
+def orderings_average_loss(orderings: list[list[str]], truth: list[str]) -> float:# {{{
+    """This loss is the the average of kendall tau distances between the truth
+    and each ordering."""
+    rankings = odrk.rankings_from_orderings(orderings)
+    true_ranking = odrk.rankings_from_orderings([truth])[0]
+    return rankings_average_loss(rankings, true_ranking)# }}}
+
+
+def rankings_average_loss(rankings: list[list[int]], truth: list[int]) -> float:# {{{
+    distance = sum(kendall_tau_dist(rkng, truth) for rkng in rankings)
+    length = len(rankings)
+    # apparently, this is what works for a good normalization
+    return distance / length
+    # return distance * 2 / (length * (length - 1))}}}
+
+
+def kmny_dist_loss(orderings: list[list[str]], truth: list[str]) -> int:# {{{
+    """Return the kendall tau distance between the truth and the kemeny-young
+    aggregation of orderings"""
+    _, agg_rank = rank_aggregation(odrk.rankings_from_orderings(orderings))
+    aggregation = odrk.ordering_from_ranking(agg_rank, truth)
+    loss = kendall_tau_dist(
+            odrk.ranking_from_ordering(aggregation),
+            odrk.ranking_from_ordering(truth))
+    return loss
+    # print(aggregation, HYPOTHESIS_ORDERING, kdl_agg_dist)}}}
+
+
+def get_loss_progression(): # {{{
+    grouped_orderings = find_orderings(parameter=PARAMETER,
+                                        summed_attribute=SUMMED_ATTRIBUTE,
+                                        criterion=CRITERION,
+                                        length=LENGTH)
+    RNG.shuffle(grouped_orderings)
+
+    average_losses = []
+    kendal_aggregation_losses = []
+
+    for nb_considered_orderings in range(1, len(grouped_orderings)+1):
+        # loss as the average distance from truth to all considered orderings
+        considered_orderings = grouped_orderings[:nb_considered_orderings]
+        loss = orderings_average_loss(orderings=considered_orderings,
+                                      truth=HYPOTHESIS_ORDERING)
+
+        # loss as the distance between truth and the aggregation
+        kdl_agg_loss = kmny_dist_loss(orderings=considered_orderings,
+                                      truth=HYPOTHESIS_ORDERING)
+        kendal_aggregation_losses.append(kdl_agg_loss)
+
+        if VERBOSE:
+            print(f"using {nb_considered_orderings} orderings")
+            tprint(considered_orderings)
+            print("truth :", HYPOTHESIS_ORDERING)
+            print("loss =", loss)
+        average_losses.append(loss)
+    return average_losses, kendal_aggregation_losses
+    # }}}
+
+################## APPLIED ON SAMPLES FOR CONCENTRATION TESTS ##################
+
+def plot_loss_progression(): # {{{
+    """Plot the progression of losses when using more and more of the values
+    (see get_loss_progression)."""
+    N = 20
+
+    avg_loss_progression, kdl_agg_loss_progression = get_loss_progression()
+    avg_loss_progression = np.array(avg_loss_progression)
+    kdl_agg_loss_progression = np.array(kdl_agg_loss_progression)
+
+    for _ in tqdm(range(N-1), leave=False):
+        avg_lp, kmny_lp = get_loss_progression()
+        avg_loss_progression += avg_lp
+        kdl_agg_loss_progression += kmny_lp
+        # print(progression)
+    if VERBOSE:
+        print(avg_loss_progression)
+        print(kdl_agg_loss_progression)
+    plt.plot(avg_loss_progression, color="orange")
+    plt.plot(kdl_agg_loss_progression, color="green")
+    # }}}
+
+def get_mode_loss_progression(all_orderings: list[list[str]],
+                              number_of_steps: int,
+                              orders_added_each_step: int =1) -> list[bool]:
+
+    # all_rankings = odrk.rankings_from_orderings(all_orderings)
+
+    # considered_orderings = list(RNG.choice(all_orderings, size=orders_added_each_step))
+    considered_orderings = list(random.choices(all_orderings, k=orders_added_each_step))
+    # count occurrences of each ordering
+    orderings_count = Counter(map(tuple, considered_orderings))
+
+    # loss progression when adding more and more orderings
+    loss_history = np.zeros(number_of_steps)
+
+    # # random permutation of the orderings
+    # permuted_orderings = np.random.permutation(all_orderings)
+
+    for idx in range(number_of_steps):
+        # new_orders = RNG.choice(all_orderings, size=orders_added_each_step)
+        new_orders = random.choices(all_orderings, k=orders_added_each_step)
+        # new_orders = permuted_orderings[orders_added_each_step*idx:orders_added_each_step*(idx+1)]
+
+        # considered_orderings.extend(new_orders)
+        # update the counter of orderings occurrences
+        orderings_count.update(Counter(map(tuple, new_orders)))
+        # the most common (modal) ordering
+        modal_ordering = orderings_count.most_common()[0][0]
+        modal_ordering = np.array(modal_ordering)
+        # if VERBOSE: print(modal_ordering)
+        # the loss is 1 if the modal ordering is the same as the hypothesis
+        loss =  int(not np.array_equal(modal_ordering, HYPOTHESIS_ORDERING))
+        # loss = int((modal_ordering == HYPOTHESIS_ORDERING).all())
+        # loss = int(all(map(lambda x: x[0]==x[1],
+        #                    zip(modal_ordering, HYPOTHESIS_ORDERING))))
+        # add loss to the list of losses
+        loss_history[idx] = loss
+    if VERBOSE:
+        # print(loss_history, HYPOTHESIS_ORDERING)
+        print(orderings_count.most_common(1)[0])
+    return np.repeat(loss_history, orders_added_each_step)
+
+
+################################################################################
+
+def plot_modal_losses():
+    ###################
+    # sampling settings
+    N = 100  # number of repetitions of the experiment
+    max_number_of_orders = 7500  # max sample size
+    GRANULARITY = 12  # granularity of the sampling (orders by iteration)
+
+    number_of_steps = max_number_of_orders // GRANULARITY
+
+    all_orderings = find_orderings(
+            parameter=PARAMETER,
+            summed_attribute=SUMMED_ATTRIBUTE,
+            criterion=CRITERION,
+            length=LENGTH,
+            authorized_parameter_values=AUTHORIZED_PARAMETER_VALUES)
+
+    print(f"there are {all_orderings.size} orders in total :")
+    tprint(all_orderings, limit=10)
+
+
+    # make get_mode_loss_progression parallelizable
+    gmlp = joblib.delayed(get_mode_loss_progression)
+
+    ####
+    # Aggregate multiple simulations
+
+    # don't use the tqdm progress bar if there are some logs
+    range_N = range(N) if VERBOSE else tqdm(range(N))
+
+    # for my 8-core computer, n_jobs=7 is empirically the best value
+    loss_history = joblib.Parallel(n_jobs=7)(
+            gmlp(all_orderings,
+                 number_of_steps,
+                 orders_added_each_step=GRANULARITY)
+            for _ in range_N
+            )
+    loss_history = np.array(loss_history)
+
+    # the sum of losses for each number of steps
+    losses = np.sum(loss_history, axis=0)
+
+    if VERBOSE: print("losses :", losses, sep="\n")
+
+    #####
+    # average
+    # since losses is the sum of losses, losses/N is the average
+    mean = losses / N
+    plt.plot(mean, color="green", label="loss average")
+
+    #####
+    # standard deviation
+    # variance is (average of squares) - (square of the average)
+    # since we only have 1 or 0, average of squares is just the average
+    # so the variance is average - average**2
+    # stddev is the square root of variance
+    stddev = np.sqrt(mean - mean**2)
+    plt.plot(stddev, color="grey", label="loss standard deviation")
+
+
+
+    ############################################################################
+    # CONFIDENCE INTERVALS
+
+    X = np.arange(mean.size)  # the x axis
+
+    ######
+    ## confidence interval
+    ## assuming the experimental variance is the correct one
+    #confidence = 0.95
+    #alpha = 1 - confidence
+    #eta = Norm.ppf(1 - alpha/2, loc=0, scale=1)
+    #epsilon = eta * stddev / np.sqrt(N)
+    #plt.fill_between(X, mean - epsilon, mean + epsilon,
+    #                 color="blue", alpha=0.25,
+    #                 label=f"{100*confidence}% confidence interval")
+
+    #####
+    # confidence interval
+    # assuming each summed distribution is a normal distribution
+    confidence = 0.999999
+    delta = 1 - confidence
+
+    # corrected sample variance
+    S = np.sqrt((1 / N-1) * (mean - mean**2))
+
+    eta = Student(df=N-1).ppf(1 - delta/2)
+    epsilon = eta * stddev / np.sqrt(N)
+    plt.fill_between(X, mean - epsilon, mean + epsilon,
+                     color="green", alpha=0.2,
+                     label=f"{100*confidence}% confidence interval")
+
+    # confidence = 0.95
+    # delta = 1 - confidence
+    # eta = Student(df=X-1).ppf(1 - delta/2)
+    # epsilon = eta * stddev / np.sqrt(X)
+    # plt.fill_between(X, mean - epsilon, mean + epsilon,
+    #                  color="green", alpha=0.5,
+    #                  label=f"{100*confidence}% confidence interval")
+
+    ######
+    ## beta distribution
+    ## confidence = 0.95
+    #delta = 1 - confidence
+    #alpha = np.cumsum(1 - loss_history, axis=1).mean(axis=0)
+    #beta = np.cumsum(loss_history, axis=1).mean(axis=0)
+    #epsilon = Beta.ppf(1 - delta/2, alpha, beta)
+    #plt.fill_between(X, mean - epsilon, mean + epsilon,
+    #                 color="orange", alpha=0.30,
+    #                 label=f"{100*confidence} β confidence interval")
+
+
+    ######
+    ## fluctuation interval
+    #confidence = 0.1
+    #alpha = 1-confidence
+    #k = Norm.ppf(alpha/2, loc=0, scale=1)
+    #fluctuation = k * stddev
+    #plt.fill_between(X, mean - fluctuation, mean + fluctuation,
+    #                 color="orange", alpha=0.25,
+    #                 label=f"{100*confidence}% fluctuation interval")
+
+    #####
+    # hoeffding
+    t = 0.99
+    plt.plot(X, np.exp(-2 * t ** 2 / X),
+             color="red")
+
+    ######
+    ## y = 1/2
+    #plt.plot([0, mean.size], [0.5, 0.5],
+    #         color="orange", alpha=0.25)
+
+if __name__ == '__main__':
+    rankings = np.array([[1, 3, 2, 4],
+                         [3, 4, 2, 1],
+                         [1, 2, 3, 4],
+                         [1, 3, 2, 4],
+                         [2, 3, 1, 4],
+                         [1, 3, 2, 1],
+                         [2, 3, 1, 4],
+                         [2, 3, 1, 4]])
+
+    # all_orderings = find_orderings(parameter=PARAMETER,
+    #                                summed_attribute=SUMMED_ATTRIBUTE,
+    #                                criterion=CRITERION,
+    #                                length=LENGTH)
+    # # print(all_orderings)
+    # print(f"There are {len(all_orderings)} orderings in `all_orderings`")
+
+    # for _ in range(20):
+    #     dep = time()
+    #     plot_modal_losses()
+    #     print(round(time()-dep, 4))
+
+    plt.style.use('dark_background')
+
+    # HYPOTHESIS_ORDERING = ("bisque", "aquamarine")
+    # plot_modal_losses()
+    HYPOTHESIS_ORDERING = ("bisque", "blue")
+    plot_modal_losses()
+    plt.legend()
+
+    ax = plt.gca()
+    # ax.set_ylim([0, 1])
+
+    # plt.ion()
+    plt.show()
+
+
--- a/src/pycache/concentration_test.cpython-312.pyc
+++ b/src/pycache/concentration_test.cpython-312.pyc
--- a/src/pycache/data.cpython-312.pyc
+++ b/src/pycache/data.cpython-312.pyc
--- a/src/pycache/kemeny_young.__tau-38.py312.1.nbc
+++ b/src/pycache/kemeny_young.__tau-38.py312.1.nbc
--- a/src/pycache/kemeny_young.__tau-38.py312.nbi
+++ b/src/pycache/kemeny_young.__tau-38.py312.nbi
--- a/src/pycache/kemeny_young.cpython-312.pyc
+++ b/src/pycache/kemeny_young.cpython-312.pyc
--- a/src/pycache/orderankings.cpython-312.pyc
+++ b/src/pycache/orderankings.cpython-312.pyc
--- a/src/pycache/query_generator.cpython-312.pyc
+++ b/src/pycache/query_generator.cpython-312.pyc
--- a/src/pycache/querying.cpython-312.pyc
+++ b/src/pycache/querying.cpython-312.pyc
--- a/src/pycache/tools.cpython-312.pyc
+++ b/src/pycache/tools.cpython-312.pyc
--- a/src/pycache/tprint.cpython-312.pyc
+++ b/src/pycache/tprint.cpython-312.pyc