move concentration_test.py to old_concentration_test.py.
gitignore the `__pycache__`.
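(The corresponding `.gitignore` entry is presumably a single `__pycache__/` line; the exact pattern is not shown in this diff.)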
This commit is contained in:
parent fc6a95334d
commit 292510f409

old_concentration_test.py · 342 lines · new file
@@ -0,0 +1,342 @@
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm as Norm, beta as Beta, t as Student
from tprint import tprint
import orderankings as odrk
from querying import find_orderings
from kemeny_young import kendall_tau_dist, rank_aggregation
from tqdm import tqdm
from collections import Counter, defaultdict
import joblib
from functools import partial
import random
import yaml

# Random number generator for the whole program
# (defined here because RNG.shuffle is used in get_loss_progression below)
RNG = np.random.default_rng(1234)

######################## YAML CONFIG (src/config.yaml) #########################
with open('src/config.yaml') as config_file:
    cfg = yaml.load(config_file, Loader=yaml.Loader)

DATABASE_NAME = cfg["database_name"]

VERBOSE = cfg["verbose"]["concentration_test"]

################## DATA SETTINGS (parameters, hypothesis...) ###################
# loaded from src/config.yaml

PARAMETER = tuple(cfg[DATABASE_NAME]["parameter"])
SUMMED_ATTRIBUTE = tuple(cfg[DATABASE_NAME]["summmed_attribute"])
# SUMMED_ATTRIBUTE = "lo_revenue"
# SUMMED_ATTRIBUTE = "lo_extendedprice"
LENGTH = cfg[DATABASE_NAME]["orders_length"]

AUTHORIZED_PARAMETER_VALUES = tuple(cfg[DATABASE_NAME]["authorized_parameter_values"])

CRITERION = tuple(cfg[DATABASE_NAME]["criterion"])

HYPOTHESIS_ORDERING = tuple(cfg[DATABASE_NAME]["hypothesis_ordering"])

assert len(HYPOTHESIS_ORDERING) == LENGTH
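
# For reference, a hypothetical src/config.yaml matching the keys read above
# (values are illustrative only; the real file is not part of this commit):
#
#   database_name: toy_db
#   verbose:
#     concentration_test: false
#   toy_db:
#     parameter: ["color"]
#     summmed_attribute: ["quantity"]
#     orders_length: 2
#     authorized_parameter_values: ["bisque", "blue"]
#     criterion: ["year"]
#     hypothesis_ordering: ["bisque", "blue"]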

################################ LOSS FUNCTIONS ################################

def orderings_average_loss(orderings: list[list[str]], truth: list[str]) -> float:# {{{
    """This loss is the average of the Kendall tau distances between the truth
    and each ordering."""
    rankings = odrk.rankings_from_orderings(orderings)
    true_ranking = odrk.rankings_from_orderings([truth])[0]
    return rankings_average_loss(rankings, true_ranking)# }}}


def rankings_average_loss(rankings: list[list[int]], truth: list[int]) -> float:# {{{
    distance = sum(kendall_tau_dist(rkng, truth) for rkng in rankings)
    length = len(rankings)
    # apparently, this is what works for a good normalization
    return distance / length
    # return distance * 2 / (length * (length - 1))}}}


def kmny_dist_loss(orderings: list[list[str]], truth: list[str]) -> int:# {{{
    """Return the Kendall tau distance between the truth and the Kemeny-Young
    aggregation of the orderings."""
    _, agg_rank = rank_aggregation(odrk.rankings_from_orderings(orderings))
    aggregation = odrk.ordering_from_ranking(agg_rank, truth)
    loss = kendall_tau_dist(
            odrk.ranking_from_ordering(aggregation),
            odrk.ranking_from_ordering(truth))
    return loss
    # print(aggregation, HYPOTHESIS_ORDERING, kdl_agg_dist)}}}
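
# Minimal usage sketch (hypothetical toy data, kept commented out): both losses
# compare observed orderings against a reference ordering; the exact values
# depend on the normalization used by kendall_tau_dist in kemeny_young.
# avg_loss = orderings_average_loss([["a", "b", "c"], ["b", "a", "c"]],
#                                   truth=["a", "b", "c"])
# agg_loss = kmny_dist_loss([["a", "b", "c"], ["b", "a", "c"]],
#                           truth=["a", "b", "c"])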


def get_loss_progression(): # {{{
    grouped_orderings = find_orderings(parameter=PARAMETER,
                                       summed_attribute=SUMMED_ATTRIBUTE,
                                       criterion=CRITERION,
                                       length=LENGTH)
    RNG.shuffle(grouped_orderings)

    average_losses = []
    kendal_aggregation_losses = []

    for nb_considered_orderings in range(1, len(grouped_orderings)+1):
        # loss as the average distance from truth to all considered orderings
        considered_orderings = grouped_orderings[:nb_considered_orderings]
        loss = orderings_average_loss(orderings=considered_orderings,
                                      truth=HYPOTHESIS_ORDERING)

        # loss as the distance between truth and the aggregation
        kdl_agg_loss = kmny_dist_loss(orderings=considered_orderings,
                                      truth=HYPOTHESIS_ORDERING)
        kendal_aggregation_losses.append(kdl_agg_loss)

        if VERBOSE:
            print(f"using {nb_considered_orderings} orderings")
            tprint(considered_orderings)
            print("truth :", HYPOTHESIS_ORDERING)
            print("loss =", loss)
        average_losses.append(loss)
    return average_losses, kendal_aggregation_losses
    # }}}

################## APPLIED ON SAMPLES FOR CONCENTRATION TESTS ##################

def plot_loss_progression(): # {{{
    """Plot the progression of losses when using more and more of the values
    (see get_loss_progression)."""
    N = 20

    avg_loss_progression, kdl_agg_loss_progression = get_loss_progression()
    avg_loss_progression = np.array(avg_loss_progression)
    kdl_agg_loss_progression = np.array(kdl_agg_loss_progression)

    for _ in tqdm(range(N-1), leave=False):
        avg_lp, kmny_lp = get_loss_progression()
        avg_loss_progression += avg_lp
        kdl_agg_loss_progression += kmny_lp
        # print(progression)
    if VERBOSE:
        print(avg_loss_progression)
        print(kdl_agg_loss_progression)
    plt.plot(avg_loss_progression, color="orange")
    plt.plot(kdl_agg_loss_progression, color="green")
    # }}}

def get_mode_loss_progression(all_orderings: list[list[str]],
                              number_of_steps: int,
                              orders_added_each_step: int = 1) -> np.ndarray:

    # all_rankings = odrk.rankings_from_orderings(all_orderings)

    # considered_orderings = list(RNG.choice(all_orderings, size=orders_added_each_step))
    considered_orderings = list(random.choices(all_orderings, k=orders_added_each_step))
    # count occurrences of each ordering
    orderings_count = Counter(map(tuple, considered_orderings))

    # loss progression when adding more and more orderings
    loss_history = np.zeros(number_of_steps)

    # # random permutation of the orderings
    # permuted_orderings = np.random.permutation(all_orderings)

    for idx in range(number_of_steps):
        # new_orders = RNG.choice(all_orderings, size=orders_added_each_step)
        new_orders = random.choices(all_orderings, k=orders_added_each_step)
        # new_orders = permuted_orderings[orders_added_each_step*idx:orders_added_each_step*(idx+1)]

        # considered_orderings.extend(new_orders)
        # update the counter of orderings occurrences
        orderings_count.update(Counter(map(tuple, new_orders)))
        # the most common (modal) ordering
        modal_ordering = orderings_count.most_common()[0][0]
        modal_ordering = np.array(modal_ordering)
        # if VERBOSE: print(modal_ordering)
        # the loss is 0 if the modal ordering matches the hypothesis, 1 otherwise
        loss = int(not np.array_equal(modal_ordering, HYPOTHESIS_ORDERING))
        # loss = int((modal_ordering == HYPOTHESIS_ORDERING).all())
        # loss = int(all(map(lambda x: x[0]==x[1],
        #                    zip(modal_ordering, HYPOTHESIS_ORDERING))))
        # add loss to the list of losses
        loss_history[idx] = loss
        if VERBOSE:
            # print(loss_history, HYPOTHESIS_ORDERING)
            print(orderings_count.most_common(1)[0])
    # repeat each step's loss so the x axis is measured in orders, not in steps
    return np.repeat(loss_history, orders_added_each_step)
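
# Example call (hypothetical toy data, kept commented out): with only two
# possible orderings the modal ordering should stabilise quickly, so the 0/1
# loss is expected to concentrate towards 0 when HYPOTHESIS_ORDERING is the
# true mode of the data.
# toy_orderings = [["bisque", "blue"], ["bisque", "blue"], ["blue", "bisque"]]
# progression = get_mode_loss_progression(toy_orderings, number_of_steps=50,
#                                         orders_added_each_step=2)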


################################################################################

def plot_modal_losses():
    ###################
    # sampling settings
    N = 100  # number of repetitions of the experiment
    max_number_of_orders = 7500  # max sample size
    GRANULARITY = 12  # granularity of the sampling (orders by iteration)

    number_of_steps = max_number_of_orders // GRANULARITY

    all_orderings = find_orderings(
            parameter=PARAMETER,
            summed_attribute=SUMMED_ATTRIBUTE,
            criterion=CRITERION,
            length=LENGTH,
            authorized_parameter_values=AUTHORIZED_PARAMETER_VALUES)

    print(f"there are {all_orderings.size} orders in total :")
    tprint(all_orderings, limit=10)

    # make get_mode_loss_progression parallelizable
    gmlp = joblib.delayed(get_mode_loss_progression)

    ####
    # Aggregate multiple simulations

    # don't use the tqdm progress bar if there are some logs
    range_N = range(N) if VERBOSE else tqdm(range(N))

    # for my 8-core computer, n_jobs=7 is empirically the best value
    loss_history = joblib.Parallel(n_jobs=7)(
            gmlp(all_orderings,
                 number_of_steps,
                 orders_added_each_step=GRANULARITY)
            for _ in range_N
            )
    loss_history = np.array(loss_history)

    # the sum of losses for each number of steps
    losses = np.sum(loss_history, axis=0)

    if VERBOSE: print("losses :", losses, sep="\n")

    #####
    # average
    # since losses is the sum of losses, losses/N is the average
    mean = losses / N
    plt.plot(mean, color="green", label="loss average")

    #####
    # standard deviation
    # variance is (average of squares) - (square of the average)
    # since we only have 1 or 0, average of squares is just the average
    # so the variance is average - average**2
    # stddev is the square root of variance
    stddev = np.sqrt(mean - mean**2)
    plt.plot(stddev, color="grey", label="loss standard deviation")


    ############################################################################
    # CONFIDENCE INTERVALS

    X = np.arange(mean.size)  # the x axis

    ######
    ## confidence interval
    ## assuming the experimental variance is the correct one
    #confidence = 0.95
    #alpha = 1 - confidence
    #eta = Norm.ppf(1 - alpha/2, loc=0, scale=1)
    #epsilon = eta * stddev / np.sqrt(N)
    #plt.fill_between(X, mean - epsilon, mean + epsilon,
    #                 color="blue", alpha=0.25,
    #                 label=f"{100*confidence}% confidence interval")

    #####
    # confidence interval
    # assuming each summed distribution is a normal distribution
    confidence = 0.999999
    delta = 1 - confidence

    # corrected sample standard deviation (Bessel's correction)
    S = np.sqrt(N / (N - 1) * (mean - mean**2))

    eta = Student(df=N-1).ppf(1 - delta/2)
    epsilon = eta * stddev / np.sqrt(N)
    plt.fill_between(X, mean - epsilon, mean + epsilon,
                     color="green", alpha=0.2,
                     label=f"{100*confidence}% confidence interval")

    # confidence = 0.95
    # delta = 1 - confidence
    # eta = Student(df=X-1).ppf(1 - delta/2)
    # epsilon = eta * stddev / np.sqrt(X)
    # plt.fill_between(X, mean - epsilon, mean + epsilon,
    #                  color="green", alpha=0.5,
    #                  label=f"{100*confidence}% confidence interval")

    ######
    ## beta distribution
    ## confidence = 0.95
    #delta = 1 - confidence
    #alpha = np.cumsum(1 - loss_history, axis=1).mean(axis=0)
    #beta = np.cumsum(loss_history, axis=1).mean(axis=0)
    #epsilon = Beta.ppf(1 - delta/2, alpha, beta)
    #plt.fill_between(X, mean - epsilon, mean + epsilon,
    #                 color="orange", alpha=0.30,
    #                 label=f"{100*confidence} β confidence interval")


    ######
    ## fluctuation interval
    #confidence = 0.1
    #alpha = 1-confidence
    #k = Norm.ppf(alpha/2, loc=0, scale=1)
    #fluctuation = k * stddev
    #plt.fill_between(X, mean - fluctuation, mean + fluctuation,
    #                 color="orange", alpha=0.25,
    #                 label=f"{100*confidence}% fluctuation interval")

    #####
    # hoeffding
    t = 0.99
    plt.plot(X, np.exp(-2 * t ** 2 / X),
             color="red")
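    # For reference (the curve above is labelled "hoeffding"): the standard
    # Hoeffding inequality for the empirical mean of n i.i.d. variables taking
    # values in [0, 1] is
    #     P(|mean - p| >= t) <= 2 * exp(-2 * n * t**2)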

    ######
    ## y = 1/2
    #plt.plot([0, mean.size], [0.5, 0.5],
    #         color="orange", alpha=0.25)

if __name__ == '__main__':
    rankings = np.array([[1, 3, 2, 4],
                         [3, 4, 2, 1],
                         [1, 2, 3, 4],
                         [1, 3, 2, 4],
                         [2, 3, 1, 4],
                         [1, 3, 2, 1],
                         [2, 3, 1, 4],
                         [2, 3, 1, 4]])

    # all_orderings = find_orderings(parameter=PARAMETER,
    #                                summed_attribute=SUMMED_ATTRIBUTE,
    #                                criterion=CRITERION,
    #                                length=LENGTH)
    # # print(all_orderings)
    # print(f"There are {len(all_orderings)} orderings in `all_orderings`")

    # for _ in range(20):
    #     dep = time()
    #     plot_modal_losses()
    #     print(round(time()-dep, 4))

    plt.style.use('dark_background')

    # HYPOTHESIS_ORDERING = ("bisque", "aquamarine")
    # plot_modal_losses()
    HYPOTHESIS_ORDERING = ("bisque", "blue")
    plot_modal_losses()
    plt.legend()

    ax = plt.gca()
    # ax.set_ylim([0, 1])

    # plt.ion()
    plt.show()
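
# Typical invocation (an assumption based on the relative path used above: the
# config is read from 'src/config.yaml', so the script is presumably run from
# the project root), e.g.:
#     python old_concentration_test.py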
Binary files not shown (10 files).