diff --git a/old_concentration_test.py b/old_concentration_test.py
new file mode 100644
index 0000000..b06a938
--- /dev/null
+++ b/old_concentration_test.py
@@ -0,0 +1,342 @@
+import matplotlib.pyplot as plt
+import numpy as np
+from scipy.stats import norm as Norm, beta as Beta, t as Student
+from tprint import tprint
+import orderankings as odrk
+from querying import find_orderings
+from kemeny_young import kendall_tau_dist, rank_aggregation
+from tqdm import tqdm
+from collections import Counter, defaultdict
+import joblib
+from functools import partial
+import random
+import yaml
+
+# Random number generator for the whole program
+RNG = np.random.default_rng(1234)
+
+
+
+######################## YAML CONFIG (src/config.yaml) #########################
+with open('src/config.yaml') as config_file:
+    cfg = yaml.load(config_file, Loader=yaml.Loader)
+
+DATABASE_NAME = cfg["database_name"]
+
+
+VERBOSE = cfg["verbose"]["concentration_test"]
+
+
+################## DATA SETTINGS (parameters, hypothesis...) ###################
+# loaded from src/config.yaml
+
+PARAMETER = tuple(cfg[DATABASE_NAME]["parameter"])
+SUMMED_ATTRIBUTE = tuple(cfg[DATABASE_NAME]["summmed_attribute"])
+# SUMMED_ATTRIBUTE = "lo_revenue"
+# SUMMED_ATTRIBUTE = "lo_extendedprice"
+LENGTH = cfg[DATABASE_NAME]["orders_length"]
+
+AUTHORIZED_PARAMETER_VALUES = tuple(cfg[DATABASE_NAME]["authorized_parameter_values"])
+
+CRITERION = tuple(cfg[DATABASE_NAME]["criterion"])
+
+HYPOTHESIS_ORDERING = tuple(cfg[DATABASE_NAME]["hypothesis_ordering"])
+
+assert len(HYPOTHESIS_ORDERING) == LENGTH
+
+################################ LOSS FUNCTIONS ################################
+
+def orderings_average_loss(orderings: list[list[str]], truth: list[str]) -> float:# {{{
+    """This loss is the average of kendall tau distances between the truth
+    and each ordering."""
+    rankings = odrk.rankings_from_orderings(orderings)
+    true_ranking = odrk.rankings_from_orderings([truth])[0]
+    return rankings_average_loss(rankings, true_ranking)# }}}
+
+
+def rankings_average_loss(rankings: list[list[int]], truth: list[int]) -> float:# {{{
+    distance = sum(kendall_tau_dist(rkng, truth) for rkng in rankings)
+    length = len(rankings)
+    # apparently, this is what works for a good normalization
+    return distance / length
+    # return distance * 2 / (length * (length - 1))}}}
+
+
+def kmny_dist_loss(orderings: list[list[str]], truth: list[str]) -> int:# {{{
+    """Return the kendall tau distance between the truth and the kemeny-young
+    aggregation of orderings."""
+    _, agg_rank = rank_aggregation(odrk.rankings_from_orderings(orderings))
+    aggregation = odrk.ordering_from_ranking(agg_rank, truth)
+    loss = kendall_tau_dist(
+            odrk.ranking_from_ordering(aggregation),
+            odrk.ranking_from_ordering(truth))
+    return loss
+    # print(aggregation, HYPOTHESIS_ORDERING, kdl_agg_dist)}}}
+
+
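+# Illustration (not used anywhere in this module): every loss above is built on
+# the Kendall tau distance, i.e. the number of item pairs that two rankings
+# order differently.  A minimal toy version, assuming tie-free rankings of
+# equal length, could look like the sketch below; the project's real
+# implementation is kemeny_young.kendall_tau_dist, whose normalization may
+# differ.
+def _toy_kendall_tau_dist(r1: list[int], r2: list[int]) -> int:
+    """Count the pairs (i, j) that r1 and r2 rank in opposite relative order."""
+    n = len(r1)
+    return sum((r1[i] < r1[j]) != (r2[i] < r2[j])
+               for i in range(n) for j in range(i + 1, n))
+
+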
+def get_loss_progression(): # {{{
+    grouped_orderings = find_orderings(parameter=PARAMETER,
+                                       summed_attribute=SUMMED_ATTRIBUTE,
+                                       criterion=CRITERION,
+                                       length=LENGTH)
+    RNG.shuffle(grouped_orderings)
+
+    average_losses = []
+    kendal_aggregation_losses = []
+
+    for nb_considered_orderings in range(1, len(grouped_orderings)+1):
+        # loss as the average distance from truth to all considered orderings
+        considered_orderings = grouped_orderings[:nb_considered_orderings]
+        loss = orderings_average_loss(orderings=considered_orderings,
+                                      truth=HYPOTHESIS_ORDERING)
+
+        # loss as the distance between truth and the aggregation
+        kdl_agg_loss = kmny_dist_loss(orderings=considered_orderings,
+                                      truth=HYPOTHESIS_ORDERING)
+        kendal_aggregation_losses.append(kdl_agg_loss)
+
+        if VERBOSE:
+            print(f"using {nb_considered_orderings} orderings")
+            tprint(considered_orderings)
+            print("truth :", HYPOTHESIS_ORDERING)
+            print("loss =", loss)
+        average_losses.append(loss)
+    return average_losses, kendal_aggregation_losses
+    # }}}
+
+################## APPLIED ON SAMPLES FOR CONCENTRATION TESTS ##################
+
+def plot_loss_progression(): # {{{
+    """Plot the progression of losses when using more and more of the values
+    (see get_loss_progression)."""
+    N = 20
+
+    avg_loss_progression, kdl_agg_loss_progression = get_loss_progression()
+    avg_loss_progression = np.array(avg_loss_progression)
+    kdl_agg_loss_progression = np.array(kdl_agg_loss_progression)
+
+    for _ in tqdm(range(N-1), leave=False):
+        avg_lp, kmny_lp = get_loss_progression()
+        avg_loss_progression += avg_lp
+        kdl_agg_loss_progression += kmny_lp
+        # print(progression)
+    if VERBOSE:
+        print(avg_loss_progression)
+        print(kdl_agg_loss_progression)
+    plt.plot(avg_loss_progression, color="orange")
+    plt.plot(kdl_agg_loss_progression, color="green")
+    # }}}
+
+def get_mode_loss_progression(all_orderings: list[list[str]],
+                              number_of_steps: int,
+                              orders_added_each_step: int = 1) -> list[bool]:
+
+    # all_rankings = odrk.rankings_from_orderings(all_orderings)
+
+    # considered_orderings = list(RNG.choice(all_orderings, size=orders_added_each_step))
+    considered_orderings = list(random.choices(all_orderings, k=orders_added_each_step))
+    # count occurrences of each ordering
+    orderings_count = Counter(map(tuple, considered_orderings))
+
+    # loss progression when adding more and more orderings
+    loss_history = np.zeros(number_of_steps)
+
+    # # random permutation of the orderings
+    # permuted_orderings = np.random.permutation(all_orderings)
+
+    for idx in range(number_of_steps):
+        # new_orders = RNG.choice(all_orderings, size=orders_added_each_step)
+        new_orders = random.choices(all_orderings, k=orders_added_each_step)
+        # new_orders = permuted_orderings[orders_added_each_step*idx:orders_added_each_step*(idx+1)]
+
+        # considered_orderings.extend(new_orders)
+        # update the counter of orderings occurrences
+        orderings_count.update(Counter(map(tuple, new_orders)))
+        # the most common (modal) ordering
+        modal_ordering = orderings_count.most_common()[0][0]
+        modal_ordering = np.array(modal_ordering)
+        # if VERBOSE: print(modal_ordering)
+        # the loss is 0 if the modal ordering equals the hypothesis, 1 otherwise
+        loss = int(not np.array_equal(modal_ordering, HYPOTHESIS_ORDERING))
+        # loss = int((modal_ordering == HYPOTHESIS_ORDERING).all())
+        # loss = int(all(map(lambda x: x[0]==x[1],
+        #                    zip(modal_ordering, HYPOTHESIS_ORDERING))))
+        # add loss to the list of losses
+        loss_history[idx] = loss
+        if VERBOSE:
+            # print(loss_history, HYPOTHESIS_ORDERING)
+            print(orderings_count.most_common(1)[0])
+    return np.repeat(loss_history, orders_added_each_step)
+
+
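+# Illustration (not executed anywhere in this module): the modal ordering is
+# simply the most frequent tuple seen so far, e.g.
+#
+#     >>> Counter([("a", "b"), ("b", "a"), ("a", "b")]).most_common(1)
+#     [(('a', 'b'), 2)]
+#
+# so each entry of loss_history is a 0/1 indicator, and averaging it over many
+# independent runs estimates, for each sample size, the probability that the
+# empirical mode differs from HYPOTHESIS_ORDERING.
+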
+################################################################################
+
+def plot_modal_losses():
+    ###################
+    # sampling settings
+    N = 100                      # number of repetitions of the experiment
+    max_number_of_orders = 7500  # max sample size
+    GRANULARITY = 12             # granularity of the sampling (orders by iteration)
+
+    number_of_steps = max_number_of_orders // GRANULARITY
+
+    all_orderings = find_orderings(
+            parameter=PARAMETER,
+            summed_attribute=SUMMED_ATTRIBUTE,
+            criterion=CRITERION,
+            length=LENGTH,
+            authorized_parameter_values=AUTHORIZED_PARAMETER_VALUES)
+
+    print(f"there are {all_orderings.size} orderings in total:")
+    tprint(all_orderings, limit=10)
+
+
+    # make get_mode_loss_progression parallelizable
+    gmlp = joblib.delayed(get_mode_loss_progression)
+
+    ####
+    # Aggregate multiple simulations
+
+    # don't use the tqdm progress bar if there are some logs
+    range_N = range(N) if VERBOSE else tqdm(range(N))
+
+    # for my 8-core computer, n_jobs=7 is empirically the best value
+    loss_history = joblib.Parallel(n_jobs=7)(
+            gmlp(all_orderings,
+                 number_of_steps,
+                 orders_added_each_step=GRANULARITY)
+            for _ in range_N
+            )
+    loss_history = np.array(loss_history)
+
+    # the sum of losses for each number of steps
+    losses = np.sum(loss_history, axis=0)
+
+    if VERBOSE: print("losses :", losses, sep="\n")
+
+    #####
+    # average
+    # since losses is the sum of losses, losses/N is the average
+    mean = losses / N
+    plt.plot(mean, color="green", label="loss average")
+
+    #####
+    # standard deviation
+    # variance is (average of squares) - (square of the average)
+    # since we only have 1 or 0, average of squares is just the average
+    # so the variance is average - average**2
+    # stddev is the square root of variance
+    stddev = np.sqrt(mean - mean**2)
+    plt.plot(stddev, color="grey", label="loss standard deviation")
+
+
+
+    ############################################################################
+    # CONFIDENCE INTERVALS
+
+    X = np.arange(mean.size)  # the x axis
+
+    ######
+    ## confidence interval
+    ## assuming the experimental variance is the correct one
+    #confidence = 0.95
+    #alpha = 1 - confidence
+    #eta = Norm.ppf(1 - alpha/2, loc=0, scale=1)
+    #epsilon = eta * stddev / np.sqrt(N)
+    #plt.fill_between(X, mean - epsilon, mean + epsilon,
+    #                 color="blue", alpha=0.25,
+    #                 label=f"{100*confidence}% confidence interval")
+
+    #####
+    # confidence interval
+    # assuming each summed distribution is a normal distribution
+    confidence = 0.999999
+    delta = 1 - confidence
+
+    # corrected sample variance
+    S = np.sqrt((1 / (N - 1)) * (mean - mean**2))
+
+    eta = Student(df=N-1).ppf(1 - delta/2)
+    epsilon = eta * stddev / np.sqrt(N)
+    plt.fill_between(X, mean - epsilon, mean + epsilon,
+                     color="green", alpha=0.2,
+                     label=f"{100*confidence}% confidence interval")
+
+    # confidence = 0.95
+    # delta = 1 - confidence
+    # eta = Student(df=X-1).ppf(1 - delta/2)
+    # epsilon = eta * stddev / np.sqrt(X)
+    # plt.fill_between(X, mean - epsilon, mean + epsilon,
+    #                  color="green", alpha=0.5,
+    #                  label=f"{100*confidence}% confidence interval")
+
+    ######
+    ## beta distribution
+    ## confidence = 0.95
+    #delta = 1 - confidence
+    #alpha = np.cumsum(1 - loss_history, axis=1).mean(axis=0)
+    #beta = np.cumsum(loss_history, axis=1).mean(axis=0)
+    #epsilon = Beta.ppf(1 - delta/2, alpha, beta)
+    #plt.fill_between(X, mean - epsilon, mean + epsilon,
+    #                 color="orange", alpha=0.30,
+    #                 label=f"{100*confidence} β confidence interval")
+
+
+    ######
+    ## fluctuation interval
+    #confidence = 0.1
+    #alpha = 1-confidence
+    #k = Norm.ppf(alpha/2, loc=0, scale=1)
+    #fluctuation = k * stddev
+    #plt.fill_between(X, mean - fluctuation, mean + fluctuation,
+    #                 color="orange", alpha=0.25,
+    #                 label=f"{100*confidence}% fluctuation interval")
+
+    #####
+    # hoeffding
+    t = 0.99
+    plt.plot(X, np.exp(-2 * t ** 2 / X),
+             color="red")
+
+    ######
+    ## y = 1/2
+    #plt.plot([0, mean.size], [0.5, 0.5],
+    #         color="orange", alpha=0.25)
+
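+# Background notes for the interval computations above (explanatory only):
+# each loss value is Bernoulli(p), so its variance is p*(1 - p), estimated by
+# mean - mean**2.  A two-sided Student-t interval at confidence 1 - delta has
+# half-width eta * s / sqrt(N), with eta = t_{N-1}(1 - delta/2) and s the
+# corrected sample standard deviation, and Hoeffding's inequality bounds
+# P(|mean - p| >= t) by 2 * exp(-2 * N * t**2) for averages of N independent
+# values in [0, 1].
+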
+if __name__ == '__main__':
+    rankings = np.array([[1, 3, 2, 4],
+                         [3, 4, 2, 1],
+                         [1, 2, 3, 4],
+                         [1, 3, 2, 4],
+                         [2, 3, 1, 4],
+                         [1, 3, 2, 1],
+                         [2, 3, 1, 4],
+                         [2, 3, 1, 4]])
+
+    # all_orderings = find_orderings(parameter=PARAMETER,
+    #                                summed_attribute=SUMMED_ATTRIBUTE,
+    #                                criterion=CRITERION,
+    #                                length=LENGTH)
+    # # print(all_orderings)
+    # print(f"There are {len(all_orderings)} orderings in `all_orderings`")
+
+    # for _ in range(20):
+    #     dep = time()
+    #     plot_modal_losses()
+    #     print(round(time()-dep, 4))
+
+    plt.style.use('dark_background')
+
+    # HYPOTHESIS_ORDERING = ("bisque", "aquamarine")
+    # plot_modal_losses()
+    HYPOTHESIS_ORDERING = ("bisque", "blue")
+    plot_modal_losses()
+    plt.legend()
+
+    ax = plt.gca()
+    # ax.set_ylim([0, 1])
+
+    # plt.ion()
+    plt.show()
+
diff --git a/src/__pycache__/concentration_test.cpython-312.pyc b/src/__pycache__/concentration_test.cpython-312.pyc
deleted file mode 100644
index 8ae3b2e..0000000
Binary files a/src/__pycache__/concentration_test.cpython-312.pyc and /dev/null differ
diff --git a/src/__pycache__/data.cpython-312.pyc b/src/__pycache__/data.cpython-312.pyc
deleted file mode 100644
index 997dfbb..0000000
Binary files a/src/__pycache__/data.cpython-312.pyc and /dev/null differ
diff --git a/src/__pycache__/kemeny_young.__tau-38.py312.1.nbc b/src/__pycache__/kemeny_young.__tau-38.py312.1.nbc
deleted file mode 100644
index 804ca74..0000000
Binary files a/src/__pycache__/kemeny_young.__tau-38.py312.1.nbc and /dev/null differ
diff --git a/src/__pycache__/kemeny_young.__tau-38.py312.nbi b/src/__pycache__/kemeny_young.__tau-38.py312.nbi
deleted file mode 100644
index e5b785a..0000000
Binary files a/src/__pycache__/kemeny_young.__tau-38.py312.nbi and /dev/null differ
diff --git a/src/__pycache__/kemeny_young.cpython-312.pyc b/src/__pycache__/kemeny_young.cpython-312.pyc
deleted file mode 100644
index 8a3913d..0000000
Binary files a/src/__pycache__/kemeny_young.cpython-312.pyc and /dev/null differ
diff --git a/src/__pycache__/orderankings.cpython-312.pyc b/src/__pycache__/orderankings.cpython-312.pyc
deleted file mode 100644
index 643781d..0000000
Binary files a/src/__pycache__/orderankings.cpython-312.pyc and /dev/null differ
diff --git a/src/__pycache__/query_generator.cpython-312.pyc b/src/__pycache__/query_generator.cpython-312.pyc
deleted file mode 100644
index 256921c..0000000
Binary files a/src/__pycache__/query_generator.cpython-312.pyc and /dev/null differ
diff --git a/src/__pycache__/querying.cpython-312.pyc b/src/__pycache__/querying.cpython-312.pyc
deleted file mode 100644
index 90ca214..0000000
Binary files a/src/__pycache__/querying.cpython-312.pyc and /dev/null differ
diff --git a/src/__pycache__/tools.cpython-312.pyc b/src/__pycache__/tools.cpython-312.pyc
deleted file mode 100644
index bbb5fd8..0000000
Binary files a/src/__pycache__/tools.cpython-312.pyc and /dev/null differ
diff --git a/src/__pycache__/tprint.cpython-312.pyc b/src/__pycache__/tprint.cpython-312.pyc
deleted file mode 100644
index b8f2229..0000000
Binary files a/src/__pycache__/tprint.cpython-312.pyc and /dev/null differ