diff --git a/src/concentration_test.py b/src/concentration_test.py
index f3faecd..e69de29 100644
--- a/src/concentration_test.py
+++ b/src/concentration_test.py
@@ -1,425 +0,0 @@
-import matplotlib.pyplot as plt
-from matplotlib.colors import CSS4_COLORS
-import numpy as np
-from scipy.stats import norm as Norm, beta as Beta, t as Student
-from tprint import tprint
-import orderankings as odrk
-from querying import find_orderings
-from kemeny_young import kendall_tau_dist, rank_aggregation
-from tqdm import tqdm
-from collections import Counter, defaultdict
-import joblib
-from functools import partial
-import random
-
-# Random number generator for the whole program
-RNG = np.random.default_rng(1234)
-
-# keep only one of these two lines: the last assignment wins
-VERBOSE = True
-VERBOSE = False
-
-
-################## DATA SETTINGS (parameters, hypothesis...) ###################
-
-# """ uncomment this line to disable the SSB settings below and use flight_delay
-# SSB dataset settings # {{{
-
-PARAMETER = "p_color"
-SUMMED_ATTRIBUTE = "lo_quantity"
-# SUMMED_ATTRIBUTE = "lo_revenue"
-# SUMMED_ATTRIBUTE = "lo_extendedprice"
-LENGTH = 2
-
-authorized_parameter_values = {
-        "p_size": tuple(range(50)),  # range already yields ints
-        "p_color": tuple(CSS4_COLORS.keys()),
-        }
-AUTHORIZED_PARAMETER_VALUES = authorized_parameter_values[PARAMETER]
-
-CRITERION = (
-        ##### customer table
-        # "c_region",
-        "c_city",
-        # "c_nation",
-
-        ##### part table
-        "p_category",
-        "p_brand",
-        # "p_mfgr",
-        # "p_color",
-        # "p_type",
-        # "p_container",
-
-        ##### supplier table
-        "s_city",
-        # "s_nation",
-        # "s_region",
-
-        ##### order date
-        # "D_DATE",
-        # "D_DATEKEY",
-        # "D_DAYOFWEEK",
-        # "D_MONTH",
-        # "D_YEAR",
-        # "D_YEARMONTHNUM",
-        # "D_YEARMONTH",
-        # "D_DAYNUMINWEEK",
-        # "D_DAYNUMINMONTH",
-        # "D_DAYNUMINYEAR",
-        # "D_MONTHNUMINYEAR",
-        # "D_WEEKNUMINYEAR",
-        # "D_SELLINGSEASON",
-        # "D_LASTDAYINWEEKFL",
-        # "D_LASTDAYINMONTHFL",
-        # "D_HOLIDAYFL",
-        # "D_WEEKDAYFL",
-        )
-
-HYPOTHESIS_ORDERING = ("bisque", "aquamarine")
-HYPOTHESIS_ORDERING = ("bisque", "blue")
-
-# HYPOTHESIS_ORDERING = [2, 32]
-# HYPOTHESIS_ORDERING = [30, 18]
-# HYPOTHESIS_ORDERING = [37, 49, 10]
-
-
-# }}}
-""" # flight_delay dataset settings {{{
-
-PARAMETER = "departure_airport"
-SUMMED_ATTRIBUTE = "nb_flights"
-LENGTH = 3
-
-CRITERION = (
-        # "airline",
-        "departure_hour",  # Simpson's paradox?
- # "day", - # "month", - # "year", - ) - - -GLOBAL_ORDERING = ['ATL', 'ORD', 'DFW', 'DEN', 'LAX', 'IAH', 'LAS', - 'SFO', 'PHX', 'MCO', 'SEA', 'CLT', 'MSP', 'LGA', - 'DTW', 'EWR', 'BOS', 'BWI', 'SLC', 'JFK'] -AUTHORIZED_PARAMETER_VALUES = GLOBAL_ORDERING - - -# Correct hypothesis for each length (so the loss converges to 0) -CORRECT_ORDERINGS = defaultdict(lambda: GLOBAL_ORDERING) -CORRECT_ORDERINGS[2] = ['ATL', 'DEN'] -CORRECT_ORDERINGS[3] = ['ATL', 'DFW', 'ORD'] -CORRECT_ORDERINGS[4] = ['ATL', 'DEN', 'DFW', 'ORD'] -CORRECT_ORDERINGS[5] = ['ATL', 'ORD', 'DFW', 'DEN', 'LAX'] -# now select the right one according to the LENGTH -CORRECT_ORDERING = CORRECT_ORDERINGS[LENGTH][:LENGTH] - -# Use the correct ordering -HYPOTHESIS_ORDERING = CORRECT_ORDERING -print(HYPOTHESIS_ORDERING) - - -# HYPOTHESIS_ORDERING = ['ATL', 'ORD', 'DWF', 'DEN', 'LAX'] -# HYPOTHESIS_ORDERING = ['ATL', 'ORD', 'DFW', 'LAX', 'DEN', 'IAH'][:LENGTH] -# HYPOTHESIS_ORDERING = ['ATL', 'ORD', 'DFW', 'DEN', 'LAS', 'LAX', 'IAH'][:LENGTH] -# HYPOTHESIS_ORDERING = ['ORD', 'ATL', 'DEN', 'DFW', 'LAX'] # interesting loss curve - -assert len(HYPOTHESIS_ORDERING) == LENGTH - -# }}} -# """ - - -def orderings_average_loss(orderings: list[list[str]], truth: list[str]) -> float:# {{{ - """This loss is the the average of kendall tau distances between the truth - and each ordering.""" - rankings = odrk.rankings_from_orderings(orderings) - true_ranking = odrk.rankings_from_orderings([truth])[0] - return rankings_average_loss(rankings, true_ranking)# }}} - - -def rankings_average_loss(rankings: list[list[int]], truth: list[int]) -> float:# {{{ - distance = sum(kendall_tau_dist(rkng, truth) for rkng in rankings) - length = len(rankings) - # apparently, this is what works for a good normalization - return distance / length - # return distance * 2 / (length * (length - 1))}}} - - -def kmny_dist_loss(orderings: list[list[str]], truth: list[str]) -> int:# {{{ - """Return the kendall tau distance between the truth and the kemeny-young - aggregation of orderings""" - _, agg_rank = rank_aggregation(odrk.rankings_from_orderings(orderings)) - aggregation = odrk.ordering_from_ranking(agg_rank, truth) - loss = kendall_tau_dist( - odrk.ranking_from_ordering(aggregation), - odrk.ranking_from_ordering(truth)) - return loss - # print(aggregation, HYPOTHESIS_ORDERING, kdl_agg_dist)}}} - - -def get_loss_progression(): # {{{ - grouped_orderings = find_orderings(parameter=PARAMETER, - summed_attribute=SUMMED_ATTRIBUTE, - criterion=CRITERION, - length=LENGTH) - RNG.shuffle(grouped_orderings) - - average_losses = [] - kendal_aggregation_losses = [] - - for nb_considered_orderings in range(1, len(grouped_orderings)+1): - # loss as the average distance from truth to all considered orderings - considered_orderings = grouped_orderings[:nb_considered_orderings] - loss = orderings_average_loss(orderings=considered_orderings, - truth=HYPOTHESIS_ORDERING) - - # loss as the distance between truth and the aggregation - kdl_agg_loss = kmny_dist_loss(orderings=considered_orderings, - truth=HYPOTHESIS_ORDERING) - kendal_aggregation_losses.append(kdl_agg_loss) - - if VERBOSE: - print(f"using {nb_considered_orderings} orderings") - tprint(considered_orderings) - print("truth :", HYPOTHESIS_ORDERING) - print("loss =", loss) - average_losses.append(loss) - return average_losses, kendal_aggregation_losses - # }}} - -def plot_loss_progression(): # {{{ - """Plot the progression of losses when using more and more of the values - (see get_loss_progression).""" - N = 20 - - 
-    avg_loss_progression, kdl_agg_loss_progression = get_loss_progression()
-    avg_loss_progression = np.array(avg_loss_progression)
-    kdl_agg_loss_progression = np.array(kdl_agg_loss_progression)
-
-    for _ in tqdm(range(N-1), leave=False):
-        avg_lp, kmny_lp = get_loss_progression()
-        avg_loss_progression += avg_lp
-        kdl_agg_loss_progression += kmny_lp
-        # print(progression)
-    if VERBOSE:
-        print(avg_loss_progression)
-        print(kdl_agg_loss_progression)
-    plt.plot(avg_loss_progression, color="orange")
-    plt.plot(kdl_agg_loss_progression, color="green")
-    # }}}
-
-def get_mode_loss_progression(all_orderings: list[list[str]],
-                              number_of_steps: int,
-                              orders_added_each_step: int = 1) -> np.ndarray:
-
-    # all_rankings = odrk.rankings_from_orderings(all_orderings)
-
-    # considered_orderings = list(RNG.choice(all_orderings, size=orders_added_each_step))
-    considered_orderings = list(random.choices(all_orderings, k=orders_added_each_step))
-    # count occurrences of each ordering
-    orderings_count = Counter(map(tuple, considered_orderings))
-
-    # loss progression when adding more and more orderings
-    loss_history = np.zeros(number_of_steps)
-
-    # # random permutation of the orderings
-    # permuted_orderings = np.random.permutation(all_orderings)
-
-    for idx in range(number_of_steps):
-        # new_orders = RNG.choice(all_orderings, size=orders_added_each_step)
-        new_orders = random.choices(all_orderings, k=orders_added_each_step)
-        # new_orders = permuted_orderings[orders_added_each_step*idx:orders_added_each_step*(idx+1)]
-
-        # considered_orderings.extend(new_orders)
-        # update the counter of ordering occurrences
-        orderings_count.update(Counter(map(tuple, new_orders)))
-        # the most common (modal) ordering
-        modal_ordering = orderings_count.most_common()[0][0]
-        modal_ordering = np.array(modal_ordering)
-        # if VERBOSE: print(modal_ordering)
-        # the loss is 0 if the modal ordering equals the hypothesis, 1 otherwise
-        loss = int(not np.array_equal(modal_ordering, HYPOTHESIS_ORDERING))
-        # loss = int((modal_ordering == HYPOTHESIS_ORDERING).all())
-        # loss = int(all(map(lambda x: x[0]==x[1],
-        #                    zip(modal_ordering, HYPOTHESIS_ORDERING))))
-        # add loss to the list of losses
-        loss_history[idx] = loss
-        if VERBOSE:
-            # print(loss_history, HYPOTHESIS_ORDERING)
-            print(orderings_count.most_common(1)[0])
-    return np.repeat(loss_history, orders_added_each_step)
-
-
-################################################################################
-
-def plot_modal_losses():
-    ###################
-    # sampling settings
-    N = 100                      # number of repetitions of the experiment
-    max_number_of_orders = 7500  # max sample size
-    GRANULARITY = 12             # granularity of the sampling (orderings added per iteration)
-
-    number_of_steps = max_number_of_orders // GRANULARITY
-
-    all_orderings = find_orderings(
-            parameter=PARAMETER,
-            summed_attribute=SUMMED_ATTRIBUTE,
-            criterion=CRITERION,
-            length=LENGTH,
-            authorized_parameter_values=AUTHORIZED_PARAMETER_VALUES)
-
-    print(f"there are {all_orderings.size} orderings in total:")
-    tprint(all_orderings, limit=10)
-
-
-    # make get_mode_loss_progression parallelizable
-    gmlp = joblib.delayed(get_mode_loss_progression)
-
-    ####
-    # Aggregate multiple simulations
-
-    # don't use the tqdm progress bar if there are some logs
-    range_N = range(N) if VERBOSE else tqdm(range(N))
-
-    # for my 8-core computer, n_jobs=7 is empirically the best value
-    loss_history = joblib.Parallel(n_jobs=7)(
-            gmlp(all_orderings,
-                 number_of_steps,
-                 orders_added_each_step=GRANULARITY)
-            for _ in range_N
-            )
-    loss_history = np.array(loss_history)
-
-    # the sum of losses for each number of steps
-    losses = np.sum(loss_history, axis=0)
-
-    if VERBOSE: print("losses:", losses, sep="\n")
-
-    #####
-    # average
-    # since losses is the sum of losses, losses/N is the average
-    mean = losses / N
-    plt.plot(mean, color="green", label="loss average")
-
-    #####
-    # standard deviation
-    # variance is (average of squares) - (square of the average)
-    # since we only have 1 or 0, the average of squares is just the average
-    # so the variance is average - average**2
-    # stddev is the square root of the variance
-    stddev = np.sqrt(mean - mean**2)
-    plt.plot(stddev, color="grey", label="loss standard deviation")
-
-
-
-    ############################################################################
-    # CONFIDENCE INTERVALS
-
-    X = np.arange(mean.size)  # the x axis
-
-    ######
-    ## confidence interval
-    ## assuming the experimental variance is the correct one
-    #confidence = 0.95
-    #alpha = 1 - confidence
-    #eta = Norm.ppf(1 - alpha/2, loc=0, scale=1)
-    #epsilon = eta * stddev / np.sqrt(N)
-    #plt.fill_between(X, mean - epsilon, mean + epsilon,
-    #                 color="blue", alpha=0.25,
-    #                 label=f"{100*confidence}% confidence interval")
-
-    #####
-    # confidence interval
-    # assuming each summed distribution is a normal distribution
-    confidence = 0.999999
-    delta = 1 - confidence
-
-    # corrected sample standard deviation: the biased Bernoulli variance
-    # (mean - mean**2) rescaled by N/(N-1)
-    S = np.sqrt((N / (N-1)) * (mean - mean**2))
-
-    eta = Student(df=N-1).ppf(1 - delta/2)
-    epsilon = eta * S / np.sqrt(N)
-    plt.fill_between(X, mean - epsilon, mean + epsilon,
-                     color="green", alpha=0.2,
-                     label=f"{100*confidence}% confidence interval")
-
-    # confidence = 0.95
-    # delta = 1 - confidence
-    # eta = Student(df=X-1).ppf(1 - delta/2)
-    # epsilon = eta * stddev / np.sqrt(X)
-    # plt.fill_between(X, mean - epsilon, mean + epsilon,
-    #                  color="green", alpha=0.5,
-    #                  label=f"{100*confidence}% confidence interval")
-
-    ######
-    ## beta distribution
-    ## confidence = 0.95
-    #delta = 1 - confidence
-    #alpha = np.cumsum(1 - loss_history, axis=1).mean(axis=0)
-    #beta = np.cumsum(loss_history, axis=1).mean(axis=0)
-    #epsilon = Beta.ppf(1 - delta/2, alpha, beta)
-    #plt.fill_between(X, mean - epsilon, mean + epsilon,
-    #                 color="orange", alpha=0.30,
-    #                 label=f"{100*confidence} β confidence interval")
-
-
-    ######
-    ## fluctuation interval
-    #confidence = 0.1
-    #alpha = 1 - confidence
-    #k = Norm.ppf(alpha/2, loc=0, scale=1)
-    #fluctuation = k * stddev
-    #plt.fill_between(X, mean - fluctuation, mean + fluctuation,
-    #                 color="orange", alpha=0.25,
-    #                 label=f"{100*confidence}% fluctuation interval")
-
-    ######
-    ## Hoeffding bound
-    #t = 0.9999999
-    #plt.plot(X, 2 * np.exp(-2 * t ** 2 / X),
-    #         color="red")
-
-    ######
-    ## y = 1/2
-    #plt.plot([0, mean.size], [0.5, 0.5],
-    #         color="orange", alpha=0.25)
-
-if __name__ == '__main__':
-    # example rankings for quick manual tests (unused by the code below)
-    rankings = np.array([[1, 3, 2, 4],
-                         [3, 4, 2, 1],
-                         [1, 2, 3, 4],
-                         [1, 3, 2, 4],
-                         [2, 3, 1, 4],
-                         [1, 3, 2, 1],
-                         [2, 3, 1, 4],
-                         [2, 3, 1, 4]])
-
-    # all_orderings = find_orderings(parameter=PARAMETER,
-    #                                summed_attribute=SUMMED_ATTRIBUTE,
-    #                                criterion=CRITERION,
-    #                                length=LENGTH)
-    # # print(all_orderings)
-    # print(f"There are {len(all_orderings)} orderings in `all_orderings`")
-
-    # for _ in range(20):
-    #     dep = time()
-    #     plot_modal_losses()
-    #     print(round(time()-dep, 4))
-
-    plt.style.use('dark_background')
-
-    # HYPOTHESIS_ORDERING = ("bisque", "aquamarine")
-    # plot_modal_losses()
-    HYPOTHESIS_ORDERING = ("bisque", "blue")
-    plot_modal_losses()
-    plt.legend()
-
-    ax = plt.gca()
-    # ax.set_ylim([0, 1])
-
-    # plt.ion()
-    plt.show()
-
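
For reference, the Student confidence interval computed in plot_modal_losses() reduces to the following self-contained sketch. The function name and the sample values are illustrative, not part of the deleted file; the only assumption is that each per-step loss is a Bernoulli (0/1) variable averaged over N independent runs.

import numpy as np
from scipy.stats import t as Student

def student_interval_halfwidth(mean, n_runs, confidence):
    """Half-width epsilon of the interval mean +/- epsilon for the
    sample mean of n_runs Bernoulli draws."""
    delta = 1 - confidence
    # the biased variance of a Bernoulli sample with mean p is p - p**2;
    # the corrected (unbiased) estimate rescales it by n/(n-1)
    corrected_std = np.sqrt((n_runs / (n_runs - 1)) * (mean - mean**2))
    eta = Student(df=n_runs - 1).ppf(1 - delta / 2)
    return eta * corrected_std / np.sqrt(n_runs)

# illustrative loss averages over N=100 runs at four sampling steps
mean = np.array([0.90, 0.50, 0.12, 0.04])
epsilon = student_interval_halfwidth(mean, n_runs=100, confidence=0.95)
print(mean - epsilon, mean + epsilon)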
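The normalization remark in rankings_average_loss() is easiest to check against a concrete distance. The sketch below assumes kendall_tau_dist() follows the usual convention of dividing the number of discordant pairs by n*(n-1)/2, so every distance lies in [0, 1] and a plain average over rankings stays in [0, 1]; both helper names here are hypothetical stand-ins for the imported functions.

from itertools import combinations

def kendall_tau_dist_sketch(ranking_a, ranking_b):
    """Fraction of item pairs ordered differently by the two rankings."""
    n = len(ranking_a)
    discordant = sum(
        (ranking_a[i] - ranking_a[j]) * (ranking_b[i] - ranking_b[j]) < 0
        for i, j in combinations(range(n), 2))
    return discordant / (n * (n - 1) / 2)

def rankings_average_loss_sketch(rankings, truth):
    """Mean normalized distance from the truth to each ranking."""
    return sum(kendall_tau_dist_sketch(r, truth) for r in rankings) / len(rankings)

# one discordant pair out of six in the first ranking, none in the second
print(rankings_average_loss_sketch([[1, 3, 2, 4], [1, 2, 3, 4]],
                                   [1, 2, 3, 4]))  # 0.0833...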
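The concentration experiment run by get_mode_loss_progression() can also be reproduced on toy data: sample orderings with replacement, keep a running Counter, and record whether the empirical mode matches the hypothesis. The 60/40 pool below is made up for illustration; in the script the pool comes from find_orderings().

import random
from collections import Counter

HYPOTHESIS = ("bisque", "blue")
# made-up pool: the hypothesis ordering wins 60% of the draws
POOL = [("bisque", "blue")] * 6 + [("blue", "bisque")] * 4

random.seed(0)
counts = Counter()
for step in range(1, 101):
    counts.update([random.choice(POOL)])
    modal_ordering = counts.most_common(1)[0][0]
    loss = int(modal_ordering != HYPOTHESIS)
    if step % 25 == 0:
        print(f"step {step}: mode={modal_ordering}, loss={loss}")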