moved into old_concentration_test.py
This commit is contained in:
parent
87ae317c31
commit
0835eff420
@@ -1,425 +0,0 @@
import matplotlib.pyplot as plt
from matplotlib.colors import CSS4_COLORS
import numpy as np
from scipy.stats import norm as Norm, beta as Beta, t as Student
from tprint import tprint
import orderankings as odrk
from querying import find_orderings
from kemeny_young import kendall_tau_dist, rank_aggregation
from tqdm import tqdm
from collections import Counter, defaultdict
import joblib
from functools import partial
import random

# Random number generator for the whole program
RNG = np.random.default_rng(1234)

VERBOSE = True
VERBOSE = False


################## DATA SETTINGS (parameters, hypothesis...) ###################
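
# The two settings blocks below are switched with a string trick: while the
# `# """` line just below stays commented out, the SSB block runs and the
# flight_delay block is swallowed by the bare `""" ...` literal that opens it;
# removing that `#` turns the SSB block into an inert string instead and the
# flight_delay settings take effect.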
# """ comment this line when using the SSB dataset
|
||||
# SSB dataset settings # {{{
|
||||
|
||||
PARAMETER = "p_color"
|
||||
SUMMED_ATTRIBUTE = "lo_quantity"
|
||||
# SUMMED_ATTRIBUTE = "lo_revenue"
|
||||
# SUMMED_ATTRIBUTE = "lo_extendedprice"
|
||||
LENGTH = 2
|
||||
|
||||
authorized_parameter_values = {
|
||||
"p_size": tuple(map(int, range(50))),
|
||||
"p_color": tuple(CSS4_COLORS.keys()),
|
||||
}
|
||||
AUTHORIZED_PARAMETER_VALUES = authorized_parameter_values[PARAMETER]
|
||||
|
||||
CRITERION = (
|
||||
##### customer table
|
||||
# "c_region",
|
||||
"c_city",
|
||||
# "c_nation",
|
||||
|
||||
##### part table
|
||||
"p_category",
|
||||
"p_brand",
|
||||
# "p_mfgr",
|
||||
# "p_color",
|
||||
# "p_type",
|
||||
# "p_container",
|
||||
|
||||
##### supplier table
|
||||
"s_city",
|
||||
# "s_nation",
|
||||
# "s_region",
|
||||
|
||||
##### order date
|
||||
# "D_DATE",
|
||||
# "D_DATEKEY",
|
||||
# "D_DATE",
|
||||
# "D_DAYOFWEEK",
|
||||
# "D_MONTH",
|
||||
# "D_YEAR",
|
||||
# "D_YEARMONTHNUM",
|
||||
# "D_YEARMONTH",
|
||||
# "D_DAYNUMINWEEK"
|
||||
# "D_DAYNUMINMONTH",
|
||||
# "D_DAYNUMINYEAR",
|
||||
# "D_MONTHNUMINYEAR",
|
||||
# "D_WEEKNUMINYEAR",
|
||||
# "D_SELLINGSEASON",
|
||||
# "D_LASTDAYINWEEKFL",
|
||||
# "D_LASTDAYINMONTHFL",
|
||||
# "D_HOLIDAYFL",
|
||||
# "D_WEEKDAYFL",
|
||||
)
|
||||
|
||||
HYPOTHESIS_ORDERING = ("bisque", "aquamarine")
|
||||
HYPOTHESIS_ORDERING = ("bisque", "blue")
|
||||
|
||||
# HYPOTHESIS_ORDERING = [2, 32]
|
||||
# HYPOTHESIS_ORDERING = [30, 18]
|
||||
# HYPOTHESIS_ORDERING = [37, 49, 10]
|
||||
|
||||
|
||||
# }}}
|
||||
""" # flight_delay dataset settings {{{
|
||||
|
||||
PARAMETER = "departure_airport"
|
||||
SUMMED_ATTRIBUTE = "nb_flights"
|
||||
LENGTH = 3
|
||||
|
||||
CRITERION = (
|
||||
# "airline",
|
||||
"departure_hour", # simpson's paradox ?
|
||||
# "day",
|
||||
# "month",
|
||||
# "year",
|
||||
)
|
||||
|
||||
|
||||
GLOBAL_ORDERING = ['ATL', 'ORD', 'DFW', 'DEN', 'LAX', 'IAH', 'LAS',
|
||||
'SFO', 'PHX', 'MCO', 'SEA', 'CLT', 'MSP', 'LGA',
|
||||
'DTW', 'EWR', 'BOS', 'BWI', 'SLC', 'JFK']
|
||||
AUTHORIZED_PARAMETER_VALUES = GLOBAL_ORDERING
|
||||
|
||||
|
||||
# Correct hypothesis for each length (so the loss converges to 0)
|
||||
CORRECT_ORDERINGS = defaultdict(lambda: GLOBAL_ORDERING)
|
||||
CORRECT_ORDERINGS[2] = ['ATL', 'DEN']
|
||||
CORRECT_ORDERINGS[3] = ['ATL', 'DFW', 'ORD']
|
||||
CORRECT_ORDERINGS[4] = ['ATL', 'DEN', 'DFW', 'ORD']
|
||||
CORRECT_ORDERINGS[5] = ['ATL', 'ORD', 'DFW', 'DEN', 'LAX']
|
||||
# now select the right one according to the LENGTH
|
||||
CORRECT_ORDERING = CORRECT_ORDERINGS[LENGTH][:LENGTH]
|
||||
|
||||
# Use the correct ordering
|
||||
HYPOTHESIS_ORDERING = CORRECT_ORDERING
|
||||
print(HYPOTHESIS_ORDERING)
|
||||
|
||||
|
||||
# HYPOTHESIS_ORDERING = ['ATL', 'ORD', 'DWF', 'DEN', 'LAX']
|
||||
# HYPOTHESIS_ORDERING = ['ATL', 'ORD', 'DFW', 'LAX', 'DEN', 'IAH'][:LENGTH]
|
||||
# HYPOTHESIS_ORDERING = ['ATL', 'ORD', 'DFW', 'DEN', 'LAS', 'LAX', 'IAH'][:LENGTH]
|
||||
# HYPOTHESIS_ORDERING = ['ORD', 'ATL', 'DEN', 'DFW', 'LAX'] # interesting loss curve
|
||||
|
||||
assert len(HYPOTHESIS_ORDERING) == LENGTH
|
||||
|
||||
# }}}
|
||||
# """
|
||||
|
||||
|
||||
def orderings_average_loss(orderings: list[list[str]], truth: list[str]) -> float:  # {{{
    """This loss is the average of the Kendall tau distances between the truth
    and each ordering."""
    rankings = odrk.rankings_from_orderings(orderings)
    true_ranking = odrk.rankings_from_orderings([truth])[0]
    return rankings_average_loss(rankings, true_ranking)  # }}}


def rankings_average_loss(rankings: list[list[int]], truth: list[int]) -> float:  # {{{
    distance = sum(kendall_tau_dist(rkng, truth) for rkng in rankings)
    length = len(rankings)
    # apparently, this is what works for a good normalization
    return distance / length
    # return distance * 2 / (length * (length - 1))  # }}}
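
# Illustration, assuming kendall_tau_dist returns the Kendall tau distance
# normalized by the number of pairs (so it lies in [0, 1]):
#   kendall_tau_dist([1, 3, 2], [1, 2, 3]) == 1/3   (1 discordant pair out of 3)
#   kendall_tau_dist([3, 2, 1], [1, 2, 3]) == 1     (all 3 pairs discordant)
# so rankings_average_loss([[1, 3, 2], [3, 2, 1]], [1, 2, 3]) == (1/3 + 1) / 2.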


def kmny_dist_loss(orderings: list[list[str]], truth: list[str]) -> int:  # {{{
    """Return the Kendall tau distance between the truth and the Kemeny-Young
    aggregation of the orderings."""
    _, agg_rank = rank_aggregation(odrk.rankings_from_orderings(orderings))
    aggregation = odrk.ordering_from_ranking(agg_rank, truth)
    loss = kendall_tau_dist(
        odrk.ranking_from_ordering(aggregation),
        odrk.ranking_from_ordering(truth))
    return loss
    # print(aggregation, HYPOTHESIS_ORDERING, kdl_agg_dist)  # }}}
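
# The Kemeny-Young aggregation is the ranking that minimizes the total Kendall
# tau distance to all the input rankings, so this loss measures how far that
# consensus ranking is from the truth, instead of averaging the per-ordering
# distances as rankings_average_loss does.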


def get_loss_progression():  # {{{
    grouped_orderings = find_orderings(parameter=PARAMETER,
                                       summed_attribute=SUMMED_ATTRIBUTE,
                                       criterion=CRITERION,
                                       length=LENGTH)
    RNG.shuffle(grouped_orderings)

    average_losses = []
    kendal_aggregation_losses = []

    for nb_considered_orderings in range(1, len(grouped_orderings)+1):
        # loss as the average distance from truth to all considered orderings
        considered_orderings = grouped_orderings[:nb_considered_orderings]
        loss = orderings_average_loss(orderings=considered_orderings,
                                      truth=HYPOTHESIS_ORDERING)

        # loss as the distance between truth and the aggregation
        kdl_agg_loss = kmny_dist_loss(orderings=considered_orderings,
                                      truth=HYPOTHESIS_ORDERING)
        kendal_aggregation_losses.append(kdl_agg_loss)

        if VERBOSE:
            print(f"using {nb_considered_orderings} orderings")
            tprint(considered_orderings)
            print("truth :", HYPOTHESIS_ORDERING)
            print("loss =", loss)
        average_losses.append(loss)
    return average_losses, kendal_aggregation_losses
    # }}}

def plot_loss_progression():  # {{{
    """Plot the progression of losses when using more and more of the values
    (see get_loss_progression)."""
    N = 20

    avg_loss_progression, kdl_agg_loss_progression = get_loss_progression()
    avg_loss_progression = np.array(avg_loss_progression)
    kdl_agg_loss_progression = np.array(kdl_agg_loss_progression)

    for _ in tqdm(range(N-1), leave=False):
        avg_lp, kmny_lp = get_loss_progression()
        avg_loss_progression += avg_lp
        kdl_agg_loss_progression += kmny_lp
        # print(progression)
    if VERBOSE:
        print(avg_loss_progression)
        print(kdl_agg_loss_progression)
    plt.plot(avg_loss_progression, color="orange")
    plt.plot(kdl_agg_loss_progression, color="green")
    # }}}
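
# Note that in plot_loss_progression, avg_loss_progression and
# kdl_agg_loss_progression accumulate the sum over the N runs, so the plotted
# curves are N times the average progression rather than the average itself.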

def get_mode_loss_progression(all_orderings: list[list[str]],
                              number_of_steps: int,
                              orders_added_each_step: int = 1) -> np.ndarray:

    # all_rankings = odrk.rankings_from_orderings(all_orderings)

    # considered_orderings = list(RNG.choice(all_orderings, size=orders_added_each_step))
    considered_orderings = list(random.choices(all_orderings, k=orders_added_each_step))
    # count occurrences of each ordering
    orderings_count = Counter(map(tuple, considered_orderings))

    # loss progression when adding more and more orderings
    loss_history = np.zeros(number_of_steps)

    # # random permutation of the orderings
    # permuted_orderings = np.random.permutation(all_orderings)

    for idx in range(number_of_steps):
        # new_orders = RNG.choice(all_orderings, size=orders_added_each_step)
        new_orders = random.choices(all_orderings, k=orders_added_each_step)
        # new_orders = permuted_orderings[orders_added_each_step*idx:orders_added_each_step*(idx+1)]

        # considered_orderings.extend(new_orders)
        # update the counter of orderings occurrences
        orderings_count.update(Counter(map(tuple, new_orders)))
        # the most common (modal) ordering
        modal_ordering = orderings_count.most_common()[0][0]
        modal_ordering = np.array(modal_ordering)
        # if VERBOSE: print(modal_ordering)
        # the loss is 1 when the modal ordering differs from the hypothesis, 0 otherwise
        loss = int(not np.array_equal(modal_ordering, HYPOTHESIS_ORDERING))
        # loss = int((modal_ordering == HYPOTHESIS_ORDERING).all())
        # loss = int(all(map(lambda x: x[0]==x[1],
        #                    zip(modal_ordering, HYPOTHESIS_ORDERING))))
        # add loss to the list of losses
        loss_history[idx] = loss
        if VERBOSE:
            # print(loss_history, HYPOTHESIS_ORDERING)
            print(orderings_count.most_common(1)[0])
    return np.repeat(loss_history, orders_added_each_step)
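
# The np.repeat in the return value above stretches each step's 0/1 loss over
# the orders_added_each_step orders drawn during that step, so the resulting
# curve is indexed by the number of individual orders rather than by iteration.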


################################################################################

def plot_modal_losses():
    ###################
    # sampling settings
    N = 100  # number of repetitions of the experiment
    max_number_of_orders = 7500  # max sample size
    GRANULARITY = 12  # granularity of the sampling (orders per iteration)

    number_of_steps = max_number_of_orders // GRANULARITY

    all_orderings = find_orderings(
        parameter=PARAMETER,
        summed_attribute=SUMMED_ATTRIBUTE,
        criterion=CRITERION,
        length=LENGTH,
        authorized_parameter_values=AUTHORIZED_PARAMETER_VALUES)

    print(f"there are {all_orderings.size} orders in total :")
    tprint(all_orderings, limit=10)


    # make get_mode_loss_progression parallelizable
    gmlp = joblib.delayed(get_mode_loss_progression)

    ####
    # Aggregate multiple simulations

    # don't use the tqdm progress bar if logs are being printed
    range_N = range(N) if VERBOSE else tqdm(range(N))

    # for my 8-core computer, n_jobs=7 is empirically the best value
    loss_history = joblib.Parallel(n_jobs=7)(
        gmlp(all_orderings,
             number_of_steps,
             orders_added_each_step=GRANULARITY)
        for _ in range_N
    )
    loss_history = np.array(loss_history)

    # the sum of losses for each number of steps
    losses = np.sum(loss_history, axis=0)

    if VERBOSE: print("losses :", losses, sep="\n")

    #####
    # average
    # since losses is the sum of losses, losses/N is the average
    mean = losses / N
    plt.plot(mean, color="green", label="loss average")

    #####
    # standard deviation
    # the variance is (average of squares) - (square of the average);
    # since the values are only 0 or 1, the average of squares equals the average,
    # so the variance is average - average**2
    # and stddev is the square root of the variance
    stddev = np.sqrt(mean - mean**2)
    plt.plot(stddev, color="grey", label="loss standard deviation")
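
    # In other words, each entry of loss_history is a Bernoulli variable whose
    # probability p is estimated by `mean`; for such a variable
    #     Var(X) = E[X^2] - E[X]^2 = p - p**2,
    # which is why the line above can take sqrt(mean - mean**2) directly.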


    ############################################################################
    # CONFIDENCE INTERVALS

    X = np.arange(mean.size)  # the x axis

    ######
    ## confidence interval
    ## assuming the experimental variance is the correct one
    #confidence = 0.95
    #alpha = 1 - confidence
    #eta = Norm.ppf(1 - alpha/2, loc=0, scale=1)
    #epsilon = eta * stddev / np.sqrt(N)
    #plt.fill_between(X, mean - epsilon, mean + epsilon,
    #                 color="blue", alpha=0.25,
    #                 label=f"{100*confidence}% confidence interval")

    #####
    # confidence interval
    # assuming each summed distribution is a normal distribution
    confidence = 0.999999
    delta = 1 - confidence

    # corrected sample variance
    S = np.sqrt((N / (N - 1)) * (mean - mean**2))

    eta = Student(df=N-1).ppf(1 - delta/2)
    epsilon = eta * stddev / np.sqrt(N)
    plt.fill_between(X, mean - epsilon, mean + epsilon,
                     color="green", alpha=0.2,
                     label=f"{100*confidence}% confidence interval")

    # confidence = 0.95
    # delta = 1 - confidence
    # eta = Student(df=X-1).ppf(1 - delta/2)
    # epsilon = eta * stddev / np.sqrt(X)
    # plt.fill_between(X, mean - epsilon, mean + epsilon,
    #                  color="green", alpha=0.5,
    #                  label=f"{100*confidence}% confidence interval")

    ######
    ## beta distribution
    ## confidence = 0.95
    #delta = 1 - confidence
    #alpha = np.cumsum(1 - loss_history, axis=1).mean(axis=0)
    #beta = np.cumsum(loss_history, axis=1).mean(axis=0)
    #epsilon = Beta.ppf(1 - delta/2, alpha, beta)
    #plt.fill_between(X, mean - epsilon, mean + epsilon,
    #                 color="orange", alpha=0.30,
    #                 label=f"{100*confidence} β confidence interval")


    ######
    ## fluctuation interval
    #confidence = 0.1
    #alpha = 1-confidence
    #k = Norm.ppf(alpha/2, loc=0, scale=1)
    #fluctuation = k * stddev
    #plt.fill_between(X, mean - fluctuation, mean + fluctuation,
    #                 color="orange", alpha=0.25,
    #                 label=f"{100*confidence}% fluctuation interval")

    ######
    ## hoeffding
    #t = 0.9999999
    #plt.plot(X, 2 * np.exp(-2 * t ** 2 / X),
    #         color="red")

    ######
    ## y = 1/2
    #plt.plot([0, mean.size], [0.5, 0.5],
    #         color="orange", alpha=0.25)

if __name__ == '__main__':
    rankings = np.array([[1, 3, 2, 4],
                         [3, 4, 2, 1],
                         [1, 2, 3, 4],
                         [1, 3, 2, 4],
                         [2, 3, 1, 4],
                         [1, 3, 2, 1],
                         [2, 3, 1, 4],
                         [2, 3, 1, 4]])
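
    # `rankings` is kept for quick manual experiments; assuming rank_aggregation
    # accepts such an array (as it does for the rankings built in kmny_dist_loss),
    # its Kemeny consensus could be inspected with, e.g.:
    #     _, agg_rank = rank_aggregation(rankings)
    #     print(agg_rank)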

    # all_orderings = find_orderings(parameter=PARAMETER,
    #                                summed_attribute=SUMMED_ATTRIBUTE,
    #                                criterion=CRITERION,
    #                                length=LENGTH)
    # # print(all_orderings)
    # print(f"There are {len(all_orderings)} orderings in `all_orderings`")

    # for _ in range(20):
    #     dep = time()
    #     plot_modal_losses()
    #     print(round(time()-dep, 4))

    plt.style.use('dark_background')

    # HYPOTHESIS_ORDERING = ("bisque", "aquamarine")
    # plot_modal_losses()
    HYPOTHESIS_ORDERING = ("bisque", "blue")
    plot_modal_losses()
    plt.legend()

    ax = plt.gca()
    # ax.set_ylim([0, 1])

    # plt.ion()
    plt.show()