moved into old_concentration_test.py

Oscar Plaisant 2024-06-27 17:23:05 +02:00
parent 87ae317c31
commit 0835eff420


@@ -1,425 +0,0 @@
import matplotlib.pyplot as plt
from matplotlib.colors import CSS4_COLORS
import numpy as np
from scipy.stats import norm as Norm, beta as Beta, t as Student
from tprint import tprint
import orderankings as odrk
from querying import find_orderings
from kemeny_young import kendall_tau_dist, rank_aggregation
from tqdm import tqdm
from collections import Counter, defaultdict
import joblib
from functools import partial
import random
# Random number generator for the whole program
RNG = np.random.default_rng(1234)
# toggle verbose logging for the whole program
# VERBOSE = True
VERBOSE = False
################## DATA SETTINGS (parameters, hypothesis...) ###################
# """ comment this line (as it is here) to use the SSB dataset settings; uncomment it to switch to the flight_delay dataset below
# SSB dataset settings # {{{
PARAMETER = "p_color"
SUMMED_ATTRIBUTE = "lo_quantity"
# SUMMED_ATTRIBUTE = "lo_revenue"
# SUMMED_ATTRIBUTE = "lo_extendedprice"
LENGTH = 2
authorized_parameter_values = {
"p_size": tuple(map(int, range(50))),
"p_color": tuple(CSS4_COLORS.keys()),
}
AUTHORIZED_PARAMETER_VALUES = authorized_parameter_values[PARAMETER]
CRITERION = (
##### customer table
# "c_region",
"c_city",
# "c_nation",
##### part table
"p_category",
"p_brand",
# "p_mfgr",
# "p_color",
# "p_type",
# "p_container",
##### supplier table
"s_city",
# "s_nation",
# "s_region",
##### order date
# "D_DATE",
# "D_DATEKEY",
# "D_DATE",
# "D_DAYOFWEEK",
# "D_MONTH",
# "D_YEAR",
# "D_YEARMONTHNUM",
# "D_YEARMONTH",
# "D_DAYNUMINWEEK"
# "D_DAYNUMINMONTH",
# "D_DAYNUMINYEAR",
# "D_MONTHNUMINYEAR",
# "D_WEEKNUMINYEAR",
# "D_SELLINGSEASON",
# "D_LASTDAYINWEEKFL",
# "D_LASTDAYINMONTHFL",
# "D_HOLIDAYFL",
# "D_WEEKDAYFL",
)
# HYPOTHESIS_ORDERING = ("bisque", "aquamarine")
HYPOTHESIS_ORDERING = ("bisque", "blue")
# HYPOTHESIS_ORDERING = [2, 32]
# HYPOTHESIS_ORDERING = [30, 18]
# HYPOTHESIS_ORDERING = [37, 49, 10]
# }}}
""" # flight_delay dataset settings {{{
PARAMETER = "departure_airport"
SUMMED_ATTRIBUTE = "nb_flights"
LENGTH = 3
CRITERION = (
# "airline",
"departure_hour", # simpson's paradox ?
# "day",
# "month",
# "year",
)
GLOBAL_ORDERING = ['ATL', 'ORD', 'DFW', 'DEN', 'LAX', 'IAH', 'LAS',
'SFO', 'PHX', 'MCO', 'SEA', 'CLT', 'MSP', 'LGA',
'DTW', 'EWR', 'BOS', 'BWI', 'SLC', 'JFK']
AUTHORIZED_PARAMETER_VALUES = GLOBAL_ORDERING
# Correct hypothesis for each length (so the loss converges to 0)
CORRECT_ORDERINGS = defaultdict(lambda: GLOBAL_ORDERING)
CORRECT_ORDERINGS[2] = ['ATL', 'DEN']
CORRECT_ORDERINGS[3] = ['ATL', 'DFW', 'ORD']
CORRECT_ORDERINGS[4] = ['ATL', 'DEN', 'DFW', 'ORD']
CORRECT_ORDERINGS[5] = ['ATL', 'ORD', 'DFW', 'DEN', 'LAX']
# now select the right one according to the LENGTH
CORRECT_ORDERING = CORRECT_ORDERINGS[LENGTH][:LENGTH]
# Use the correct ordering
HYPOTHESIS_ORDERING = CORRECT_ORDERING
print(HYPOTHESIS_ORDERING)
# HYPOTHESIS_ORDERING = ['ATL', 'ORD', 'DFW', 'DEN', 'LAX']
# HYPOTHESIS_ORDERING = ['ATL', 'ORD', 'DFW', 'LAX', 'DEN', 'IAH'][:LENGTH]
# HYPOTHESIS_ORDERING = ['ATL', 'ORD', 'DFW', 'DEN', 'LAS', 'LAX', 'IAH'][:LENGTH]
# HYPOTHESIS_ORDERING = ['ORD', 'ATL', 'DEN', 'DFW', 'LAX'] # interesting loss curve
assert len(HYPOTHESIS_ORDERING) == LENGTH
# }}}
# """
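# Note on the dataset toggle above: while the """ on the SSB line stays
# commented out, the SSB settings are executed and the flight_delay block is
# absorbed into the module-level string literal opened just before it;
# uncommenting that """ flips which block is live.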
def orderings_average_loss(orderings: list[list[str]], truth: list[str]) -> float:# {{{
    """This loss is the average of Kendall tau distances between the truth
    and each ordering."""
rankings = odrk.rankings_from_orderings(orderings)
true_ranking = odrk.rankings_from_orderings([truth])[0]
return rankings_average_loss(rankings, true_ranking)# }}}
def rankings_average_loss(rankings: list[list[int]], truth: list[int]) -> float:# {{{
distance = sum(kendall_tau_dist(rkng, truth) for rkng in rankings)
length = len(rankings)
    # dividing by the number of rankings gives the mean Kendall tau distance per ranking
return distance / length
# return distance * 2 / (length * (length - 1))}}}
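# Illustrative sketch (hypothetical helper, never called): the averaged loss
# above can also be computed from scratch by counting, for each ordering, the
# pairs of items ranked in the opposite relative order to the truth (discordant
# pairs), then averaging over the orderings. The real distance comes from
# kemeny_young.kendall_tau_dist; this only spells the idea out.
def _example_average_discordant_pairs(orderings: list[list[str]],
                                      truth: list[str]) -> float:
    """Average number of discordant pairs between each ordering and `truth`."""
    true_rank = {item: rank for rank, item in enumerate(truth)}
    def discordant_pairs(ordering: list[str]) -> int:
        ranks = [true_rank[item] for item in ordering]
        return sum(1
                   for i in range(len(ranks))
                   for j in range(i + 1, len(ranks))
                   if ranks[i] > ranks[j])
    return sum(discordant_pairs(o) for o in orderings) / len(orderings)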
def kmny_dist_loss(orderings: list[list[str]], truth: list[str]) -> int:# {{{
    """Return the Kendall tau distance between the truth and the Kemeny-Young
    aggregation of the orderings."""
_, agg_rank = rank_aggregation(odrk.rankings_from_orderings(orderings))
aggregation = odrk.ordering_from_ranking(agg_rank, truth)
loss = kendall_tau_dist(
odrk.ranking_from_ordering(aggregation),
odrk.ranking_from_ordering(truth))
return loss
# print(aggregation, HYPOTHESIS_ORDERING, kdl_agg_dist)}}}
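# Sketch of what the aggregation step computes (illustrative only): a
# Kemeny-Young consensus is the permutation of the items that minimizes the
# total number of pairwise disagreements with the input orderings. The project
# uses kemeny_young.rank_aggregation; the brute force below only makes the
# definition concrete, assumes every ordering contains the same items, and is
# tractable only for very short orderings.
from itertools import permutations

def _example_kemeny_consensus(orderings: list[list[str]]) -> tuple[str, ...]:
    """Brute-force consensus: the permutation with the fewest pairwise disagreements."""
    items = sorted(set(orderings[0]))
    def total_disagreements(candidate: tuple[str, ...]) -> int:
        cand_pos = {item: i for i, item in enumerate(candidate)}
        total = 0
        for ordering in orderings:
            pos = {item: i for i, item in enumerate(ordering)}
            total += sum(1
                         for i, a in enumerate(items)
                         for b in items[i + 1:]
                         if (cand_pos[a] < cand_pos[b]) != (pos[a] < pos[b]))
        return total
    return min(permutations(items), key=total_disagreements)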
def get_loss_progression(): # {{{
grouped_orderings = find_orderings(parameter=PARAMETER,
summed_attribute=SUMMED_ATTRIBUTE,
criterion=CRITERION,
length=LENGTH)
RNG.shuffle(grouped_orderings)
average_losses = []
kendal_aggregation_losses = []
for nb_considered_orderings in range(1, len(grouped_orderings)+1):
# loss as the average distance from truth to all considered orderings
considered_orderings = grouped_orderings[:nb_considered_orderings]
loss = orderings_average_loss(orderings=considered_orderings,
truth=HYPOTHESIS_ORDERING)
# loss as the distance between truth and the aggregation
kdl_agg_loss = kmny_dist_loss(orderings=considered_orderings,
truth=HYPOTHESIS_ORDERING)
kendal_aggregation_losses.append(kdl_agg_loss)
if VERBOSE:
print(f"using {nb_considered_orderings} orderings")
tprint(considered_orderings)
print("truth :", HYPOTHESIS_ORDERING)
print("loss =", loss)
average_losses.append(loss)
return average_losses, kendal_aggregation_losses
# }}}
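# Note on the loop above: once the distance of each individual ordering to the
# truth is known, the "average loss over the first k orderings" curve is just a
# cumulative mean, which numpy produces directly. Illustrative sketch with
# hypothetical per-ordering distances; the real loop recomputes the average so
# that it can also track the Kemeny aggregation loss at each step.
def _example_cumulative_mean(per_ordering_distances: np.ndarray) -> np.ndarray:
    """Running average of the distances over growing prefixes of orderings."""
    counts = np.arange(1, len(per_ordering_distances) + 1)
    return np.cumsum(per_ordering_distances) / counts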
def plot_loss_progression(): # {{{
"""Plot the progression of losses when using more and more of the values
(see get_loss_progression)."""
N = 20
avg_loss_progression, kdl_agg_loss_progression = get_loss_progression()
avg_loss_progression = np.array(avg_loss_progression)
kdl_agg_loss_progression = np.array(kdl_agg_loss_progression)
for _ in tqdm(range(N-1), leave=False):
avg_lp, kmny_lp = get_loss_progression()
avg_loss_progression += avg_lp
kdl_agg_loss_progression += kmny_lp
# print(progression)
if VERBOSE:
print(avg_loss_progression)
print(kdl_agg_loss_progression)
plt.plot(avg_loss_progression, color="orange")
plt.plot(kdl_agg_loss_progression, color="green")
# }}}
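# plot_loss_progression plots the *sum* of the N progressions. If an averaged
# curve is wanted instead, stacking the runs and averaging along axis 0 is a
# compact alternative (hypothetical sketch, not wired into the plotting code;
# it assumes every run sees the same number of orderings).
def _example_mean_progression(number_of_runs: int = 20) -> np.ndarray:
    """Average the average-loss progression over several shuffled runs."""
    runs = [get_loss_progression()[0] for _ in range(number_of_runs)]
    return np.stack(runs).mean(axis=0)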
def get_mode_loss_progression(all_orderings: list[list[str]],
                              number_of_steps: int,
                              orders_added_each_step: int = 1) -> np.ndarray:
    """Track, step by step, whether the modal (most frequent) ordering of the
    sample matches HYPOTHESIS_ORDERING, as a 0/1 loss history."""
# all_rankings = odrk.rankings_from_orderings(all_orderings)
# considered_orderings = list(RNG.choice(all_orderings, size=orders_added_each_step))
considered_orderings = list(random.choices(all_orderings, k=orders_added_each_step))
# count occurrences of each ordering
orderings_count = Counter(map(tuple, considered_orderings))
# loss progression when adding more and more orderings
loss_history = np.zeros(number_of_steps)
# # random permutation of the orderings
# permuted_orderings = np.random.permutation(all_orderings)
for idx in range(number_of_steps):
# new_orders = RNG.choice(all_orderings, size=orders_added_each_step)
new_orders = random.choices(all_orderings, k=orders_added_each_step)
# new_orders = permuted_orderings[orders_added_each_step*idx:orders_added_each_step*(idx+1)]
# considered_orderings.extend(new_orders)
# update the counter of orderings occurrences
orderings_count.update(Counter(map(tuple, new_orders)))
# the most common (modal) ordering
modal_ordering = orderings_count.most_common()[0][0]
modal_ordering = np.array(modal_ordering)
# if VERBOSE: print(modal_ordering)
        # the loss is 0 if the modal ordering equals the hypothesis, 1 otherwise
loss = int(not np.array_equal(modal_ordering, HYPOTHESIS_ORDERING))
# loss = int((modal_ordering == HYPOTHESIS_ORDERING).all())
# loss = int(all(map(lambda x: x[0]==x[1],
# zip(modal_ordering, HYPOTHESIS_ORDERING))))
# add loss to the list of losses
loss_history[idx] = loss
if VERBOSE:
# print(loss_history, HYPOTHESIS_ORDERING)
print(orderings_count.most_common(1)[0])
return np.repeat(loss_history, orders_added_each_step)
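# The heart of the estimator above is "take the most frequent ordering seen so
# far and compare it with the hypothesis". A minimal sketch on made-up data
# (values purely illustrative):
def _example_modal_ordering() -> tuple[tuple[str, ...], int]:
    """Return the most common ordering of a toy sample together with its count."""
    sample = [("bisque", "blue"), ("blue", "bisque"),
              ("bisque", "blue"), ("bisque", "blue")]
    counts = Counter(sample)
    return counts.most_common(1)[0]  # (("bisque", "blue"), 3)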
################################################################################
def plot_modal_losses():
###################
# sampling settings
N = 100 # number of repetitions of the experiment
max_number_of_orders = 7500 # max sample size
    GRANULARITY = 12 # granularity of the sampling (orderings added per step)
number_of_steps = max_number_of_orders // GRANULARITY
all_orderings = find_orderings(
parameter=PARAMETER,
summed_attribute=SUMMED_ATTRIBUTE,
criterion=CRITERION,
length=LENGTH,
authorized_parameter_values=AUTHORIZED_PARAMETER_VALUES)
    print(f"there are {all_orderings.size} orderings in total:")
tprint(all_orderings, limit=10)
# make get_mode_loss_progression parallelizable
gmlp = joblib.delayed(get_mode_loss_progression)
####
# Aggregate multiple simulations
# don't use the tqdm progress bar if there are some logs
range_N = range(N) if VERBOSE else tqdm(range(N))
# for my 8-core computer, n_jobs=7 is empirically the best value
loss_history = joblib.Parallel(n_jobs=7)(
gmlp(all_orderings,
number_of_steps,
orders_added_each_step=GRANULARITY)
for _ in range_N
)
loss_history = np.array(loss_history)
# the sum of losses for each number of steps
losses = np.sum(loss_history, axis=0)
if VERBOSE: print("losses :", losses, sep="\n")
#####
# average
# since losses is the sum of losses, losses/N is the average
mean = losses / N
plt.plot(mean, color="green", label="loss average")
#####
# standard deviation
# variance is (average of squares) - (square of the average)
# since we only have 1 or 0, average of squares is just the average
# so the variance is average - average**2
# stddev is the square root of variance
stddev = np.sqrt(mean - mean**2)
plt.plot(stddev, color="grey", label="loss standard deviation")
############################################################################
# CONFIDENCE INTERVALS
X = np.arange(mean.size) # the x axis
######
## confidence interval
## assuming the experimental variance is the correct one
#confidence = 0.95
#alpha = 1 - confidence
#eta = Norm.ppf(1 - alpha/2, loc=0, scale=1)
#epsilon = eta * stddev / np.sqrt(N)
#plt.fill_between(X, mean - epsilon, mean + epsilon,
# color="blue", alpha=0.25,
# label=f"{100*confidence}% confidence interval")
#####
# confidence interval
    # assuming the per-step mean loss is approximately normally distributed (Student-t interval)
confidence = 0.999999
delta = 1 - confidence
    # corrected (unbiased) sample variance: for 0/1 losses,
    # S**2 = (1/(N-1)) * sum((x_i - mean)**2) = N/(N-1) * (mean - mean**2)
    S = np.sqrt((N / (N - 1)) * (mean - mean**2))
    eta = Student(df=N-1).ppf(1 - delta/2)
    epsilon = eta * S / np.sqrt(N)
plt.fill_between(X, mean - epsilon, mean + epsilon,
color="green", alpha=0.2,
label=f"{100*confidence}% confidence interval")
# confidence = 0.95
# delta = 1 - confidence
# eta = Student(df=X-1).ppf(1 - delta/2)
# epsilon = eta * stddev / np.sqrt(X)
# plt.fill_between(X, mean - epsilon, mean + epsilon,
# color="green", alpha=0.5,
# label=f"{100*confidence}% confidence interval")
######
## beta distribution
## confidence = 0.95
#delta = 1 - confidence
#alpha = np.cumsum(1 - loss_history, axis=1).mean(axis=0)
#beta = np.cumsum(loss_history, axis=1).mean(axis=0)
#epsilon = Beta.ppf(1 - delta/2, alpha, beta)
#plt.fill_between(X, mean - epsilon, mean + epsilon,
# color="orange", alpha=0.30,
# label=f"{100*confidence} β confidence interval")
######
## fluctuation interval
#confidence = 0.1
#alpha = 1-confidence
#k = Norm.ppf(alpha/2, loc=0, scale=1)
#fluctuation = k * stddev
#plt.fill_between(X, mean - fluctuation, mean + fluctuation,
# color="orange", alpha=0.25,
# label=f"{100*confidence}% fluctuation interval")
######
## hoeffding
#t = 0.9999999
#plt.plot(X, 2 * np.exp(-2 * t ** 2 / X),
# color="red")
######
## y = 1/2
#plt.plot([0, mean.size], [0.5, 0.5],
# color="orange", alpha=0.25)
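# The confidence band above relies on two facts spelled out in the comments:
# for 0/1 losses the (biased) variance is mean - mean**2, and the Student-t
# half-width at level 1-delta is t_{1-delta/2, N-1} * s / sqrt(N). A
# self-contained sanity check of those formulas on a toy 0/1 sample
# (hypothetical data, not tied to the experiment):
def _example_student_half_width(confidence: float = 0.95) -> float:
    """Half-width of a Student-t confidence interval for a toy 0/1 sample."""
    sample = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 1], dtype=float)
    n = sample.size
    mean = sample.mean()
    # for 0/1 data, the biased variance mean - mean**2 equals sample.var()
    assert np.isclose(sample.var(), mean - mean**2)
    corrected_std = np.sqrt(n / (n - 1) * (mean - mean**2))
    delta = 1 - confidence
    eta = Student(df=n - 1).ppf(1 - delta / 2)
    return eta * corrected_std / np.sqrt(n)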
if __name__ == '__main__':
rankings = np.array([[1, 3, 2, 4],
[3, 4, 2, 1],
[1, 2, 3, 4],
[1, 3, 2, 4],
[2, 3, 1, 4],
[1, 3, 2, 1],
[2, 3, 1, 4],
[2, 3, 1, 4]])
# all_orderings = find_orderings(parameter=PARAMETER,
# summed_attribute=SUMMED_ATTRIBUTE,
# criterion=CRITERION,
# length=LENGTH)
# # print(all_orderings)
# print(f"There are {len(all_orderings)} orderings in `all_orderings`")
# for _ in range(20):
# dep = time()
# plot_modal_losses()
# print(round(time()-dep, 4))
plt.style.use('dark_background')
# HYPOTHESIS_ORDERING = ("bisque", "aquamarine")
# plot_modal_losses()
HYPOTHESIS_ORDERING = ("bisque", "blue")
plot_modal_losses()
plt.legend()
ax = plt.gca()
# ax.set_ylim([0, 1])
# plt.ion()
plt.show()