Compare commits
7 Commits
cb3feb6a4d
...
f84aec4456
Author | SHA1 | Date | |
---|---|---|---|
|
f84aec4456 | ||
|
b13f8ab039 | ||
|
ff0f646d04 | ||
|
0835eff420 | ||
|
87ae317c31 | ||
|
6a88756e84 | ||
|
be6cbf46c6 |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -2,7 +2,7 @@ bin/*
|
||||
include/*
|
||||
lib/*
|
||||
share/*
|
||||
pyenv.cfg
|
||||
pyvenv.cfg
|
||||
src/__pycache__/*
|
||||
.DS_Store
|
||||
modal_losses N=1000 nb_orders=10000 granularity=1.png
|
||||
|
9
Makefile
9
Makefile
@@ -5,12 +5,15 @@ DATABASE_FILE=${DATABASE_FOLDER}/${DATABASE_NAME}.db
|
||||
|
||||
all: execute-script
|
||||
|
||||
execute-script:
|
||||
python3 concentration_test.py
|
||||
execute-script: requirements.txt
|
||||
source bin/activate; \
|
||||
python3 src/concentration_test.py; \
|
||||
|
||||
pip-install:
|
||||
requirements.txt:
|
||||
bin/pip3 install -r requirements.txt
|
||||
|
||||
# run o
|
||||
|
||||
reset: delete-database import-from-csv
|
||||
|
||||
open:
|
||||
|
@@ -5,3 +5,4 @@ fastcache
|
||||
tqdm
|
||||
joblib
|
||||
scipy
|
||||
PyYAML
|
||||
|
@@ -1,425 +0,0 @@
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.colors import CSS4_COLORS
|
||||
import numpy as np
|
||||
from scipy.stats import norm as Norm, beta as Beta, t as Student
|
||||
from tprint import tprint
|
||||
import orderankings as odrk
|
||||
from querying import find_orderings
|
||||
from kemeny_young import kendall_tau_dist, rank_aggregation
|
||||
from tqdm import tqdm
|
||||
from collections import Counter, defaultdict
|
||||
import joblib
|
||||
from functools import partial
|
||||
import random
|
||||
|
||||
# Random number generator for the whole program
|
||||
RNG = np.random.default_rng(1234)
|
||||
|
||||
# Verbosity switch for this module: when True, intermediate results are
# printed and plot_modal_losses iterates without the tqdm progress bar.
# VERBOSE = True
VERBOSE = False
|
||||
|
||||
|
||||
################## DATA SETTINGS (parameters, hypothesis...) ###################
|
||||
|
||||
# """ comment this line when using the SSB dataset
|
||||
# SSB dataset settings # {{{
|
||||
|
||||
PARAMETER = "p_color"
|
||||
SUMMED_ATTRIBUTE = "lo_quantity"
|
||||
# SUMMED_ATTRIBUTE = "lo_revenue"
|
||||
# SUMMED_ATTRIBUTE = "lo_extendedprice"
|
||||
LENGTH = 2
|
||||
|
||||
authorized_parameter_values = {
|
||||
"p_size": tuple(map(int, range(50))),
|
||||
"p_color": tuple(CSS4_COLORS.keys()),
|
||||
}
|
||||
AUTHORIZED_PARAMETER_VALUES = authorized_parameter_values[PARAMETER]
|
||||
|
||||
CRITERION = (
|
||||
##### customer table
|
||||
# "c_region",
|
||||
"c_city",
|
||||
# "c_nation",
|
||||
|
||||
##### part table
|
||||
"p_category",
|
||||
"p_brand",
|
||||
# "p_mfgr",
|
||||
# "p_color",
|
||||
# "p_type",
|
||||
# "p_container",
|
||||
|
||||
##### supplier table
|
||||
"s_city",
|
||||
# "s_nation",
|
||||
# "s_region",
|
||||
|
||||
##### order date
|
||||
# "D_DATE",
|
||||
# "D_DATEKEY",
|
||||
# "D_DATE",
|
||||
# "D_DAYOFWEEK",
|
||||
# "D_MONTH",
|
||||
# "D_YEAR",
|
||||
# "D_YEARMONTHNUM",
|
||||
# "D_YEARMONTH",
|
||||
# "D_DAYNUMINWEEK"
|
||||
# "D_DAYNUMINMONTH",
|
||||
# "D_DAYNUMINYEAR",
|
||||
# "D_MONTHNUMINYEAR",
|
||||
# "D_WEEKNUMINYEAR",
|
||||
# "D_SELLINGSEASON",
|
||||
# "D_LASTDAYINWEEKFL",
|
||||
# "D_LASTDAYINMONTHFL",
|
||||
# "D_HOLIDAYFL",
|
||||
# "D_WEEKDAYFL",
|
||||
)
|
||||
|
||||
# Hypothesised ordering that the loss functions compare the orderings found
# in the data against. Alternative kept for quick switching:
# HYPOTHESIS_ORDERING = ("bisque", "aquamarine")
HYPOTHESIS_ORDERING = ("bisque", "blue")
|
||||
|
||||
# HYPOTHESIS_ORDERING = [2, 32]
|
||||
# HYPOTHESIS_ORDERING = [30, 18]
|
||||
# HYPOTHESIS_ORDERING = [37, 49, 10]
|
||||
|
||||
|
||||
# }}}
|
||||
""" # flight_delay dataset settings {{{
|
||||
|
||||
PARAMETER = "departure_airport"
|
||||
SUMMED_ATTRIBUTE = "nb_flights"
|
||||
LENGTH = 3
|
||||
|
||||
CRITERION = (
|
||||
# "airline",
|
||||
"departure_hour", # simpson's paradox ?
|
||||
# "day",
|
||||
# "month",
|
||||
# "year",
|
||||
)
|
||||
|
||||
|
||||
GLOBAL_ORDERING = ['ATL', 'ORD', 'DFW', 'DEN', 'LAX', 'IAH', 'LAS',
|
||||
'SFO', 'PHX', 'MCO', 'SEA', 'CLT', 'MSP', 'LGA',
|
||||
'DTW', 'EWR', 'BOS', 'BWI', 'SLC', 'JFK']
|
||||
AUTHORIZED_PARAMETER_VALUES = GLOBAL_ORDERING
|
||||
|
||||
|
||||
# Correct hypothesis for each length (so the loss converges to 0)
|
||||
CORRECT_ORDERINGS = defaultdict(lambda: GLOBAL_ORDERING)
|
||||
CORRECT_ORDERINGS[2] = ['ATL', 'DEN']
|
||||
CORRECT_ORDERINGS[3] = ['ATL', 'DFW', 'ORD']
|
||||
CORRECT_ORDERINGS[4] = ['ATL', 'DEN', 'DFW', 'ORD']
|
||||
CORRECT_ORDERINGS[5] = ['ATL', 'ORD', 'DFW', 'DEN', 'LAX']
|
||||
# now select the right one according to the LENGTH
|
||||
CORRECT_ORDERING = CORRECT_ORDERINGS[LENGTH][:LENGTH]
|
||||
|
||||
# Use the correct ordering
|
||||
HYPOTHESIS_ORDERING = CORRECT_ORDERING
|
||||
print(HYPOTHESIS_ORDERING)
|
||||
|
||||
|
||||
# HYPOTHESIS_ORDERING = ['ATL', 'ORD', 'DWF', 'DEN', 'LAX']
|
||||
# HYPOTHESIS_ORDERING = ['ATL', 'ORD', 'DFW', 'LAX', 'DEN', 'IAH'][:LENGTH]
|
||||
# HYPOTHESIS_ORDERING = ['ATL', 'ORD', 'DFW', 'DEN', 'LAS', 'LAX', 'IAH'][:LENGTH]
|
||||
# HYPOTHESIS_ORDERING = ['ORD', 'ATL', 'DEN', 'DFW', 'LAX'] # interesting loss curve
|
||||
|
||||
assert len(HYPOTHESIS_ORDERING) == LENGTH
|
||||
|
||||
# }}}
|
||||
# """
|
||||
|
||||
|
||||
def orderings_average_loss(orderings: list[list[str]], truth: list[str]) -> float:  # {{{
    """Return the average Kendall tau distance between `truth` and each ordering.

    Both `orderings` and `truth` are converted to rankings with
    `odrk.rankings_from_orderings`; the averaging itself is delegated to
    `rankings_average_loss`.
    """
    observed_rankings = odrk.rankings_from_orderings(orderings)
    # rankings_from_orderings works on a batch, hence the one-element list
    truth_batch = odrk.rankings_from_orderings([truth])
    return rankings_average_loss(observed_rankings, truth_batch[0])
# }}}
|
||||
|
||||
|
||||
def rankings_average_loss(rankings: list[list[int]], truth: list[int]) -> float:  # {{{
    """Return the mean Kendall tau distance between `truth` and each ranking.

    The distances computed by `kendall_tau_dist` are summed and divided by
    the number of rankings, so an empty `rankings` raises ZeroDivisionError.
    """
    total_distance = 0
    for ranking in rankings:
        total_distance += kendall_tau_dist(ranking, truth)
    # Dividing by the number of rankings is the normalisation that was kept;
    # the discarded alternative was total_distance * 2 / (n * (n - 1)).
    return total_distance / len(rankings)
# }}}
|
||||
|
||||
|
||||
def kmny_dist_loss(orderings: list[list[str]], truth: list[str]) -> int:  # {{{
    """Return the Kendall tau distance between `truth` and the aggregated ordering.

    The orderings are converted to rankings and aggregated with
    `rank_aggregation`; the aggregated ranking is mapped back to an ordering
    over the values of `truth` before the distance is measured.
    """
    observed_rankings = odrk.rankings_from_orderings(orderings)
    # rank_aggregation returns (minimal distance, best ranking); only the
    # ranking is needed here
    aggregated_ranking = rank_aggregation(observed_rankings)[1]
    aggregated_ordering = odrk.ordering_from_ranking(aggregated_ranking, truth)

    aggregated_as_ranking = odrk.ranking_from_ordering(aggregated_ordering)
    truth_as_ranking = odrk.ranking_from_ordering(truth)
    return kendall_tau_dist(aggregated_as_ranking, truth_as_ranking)
# }}}
|
||||
|
||||
|
||||
def get_loss_progression():  # {{{
    """Compute both losses on growing prefixes of the shuffled orderings.

    The orderings returned by `find_orderings` for the module-level settings
    are shuffled in place with the shared `RNG`. Then, for every prefix
    length from 1 to the number of orderings, two losses against
    `HYPOTHESIS_ORDERING` are recorded.

    Returns:
        A pair `(average_losses, kendal_aggregation_losses)` of lists holding,
        for each prefix length, the value of `orderings_average_loss` and the
        value of `kmny_dist_loss` respectively.
    """
    grouped_orderings = find_orderings(
        parameter=PARAMETER,
        summed_attribute=SUMMED_ATTRIBUTE,
        criterion=CRITERION,
        length=LENGTH,
    )
    RNG.shuffle(grouped_orderings)

    average_losses = []
    kendal_aggregation_losses = []
    total = len(grouped_orderings)

    for prefix_length in range(1, total + 1):
        prefix = grouped_orderings[:prefix_length]

        # loss as the average distance from the hypothesis to the prefix
        average_loss = orderings_average_loss(orderings=prefix,
                                              truth=HYPOTHESIS_ORDERING)

        # loss as the distance between the hypothesis and the aggregation
        kendal_aggregation_losses.append(
            kmny_dist_loss(orderings=prefix, truth=HYPOTHESIS_ORDERING))

        if VERBOSE:
            print(f"using {prefix_length} orderings")
            tprint(prefix)
            print("truth :", HYPOTHESIS_ORDERING)
            print("loss =", average_loss)
        average_losses.append(average_loss)

    return average_losses, kendal_aggregation_losses
# }}}
|
||||
|
||||
def plot_loss_progression():  # {{{
    """Plot the loss progressions accumulated over N runs.

    `get_loss_progression` is called N times and its two progressions are
    added up element-wise; the two totals are then drawn on the current
    matplotlib figure (average-distance loss in orange, aggregation loss in
    green).
    """
    N = 20

    first_average, first_aggregation = get_loss_progression()
    avg_loss_total = np.array(first_average)
    kdl_agg_loss_total = np.array(first_aggregation)

    # NOTE(review): the totals are sums over the N runs and are never divided
    # by N before plotting - confirm this scaling is intended.
    for _ in tqdm(range(N - 1), leave=False):
        run_average, run_aggregation = get_loss_progression()
        avg_loss_total += run_average
        kdl_agg_loss_total += run_aggregation

    if VERBOSE:
        print(avg_loss_total)
        print(kdl_agg_loss_total)

    plt.plot(avg_loss_total, color="orange")
    plt.plot(kdl_agg_loss_total, color="green")
# }}}
|
||||
|
||||
def get_mode_loss_progression(all_orderings: list[list[str]],
                              number_of_steps: int,
                              orders_added_each_step: int = 1) -> np.ndarray:
    """Simulate the 0/1 loss of the modal ordering while the sample grows.

    Orderings are drawn with replacement from `all_orderings` using the
    standard `random` module. After each batch, the most frequent (modal)
    ordering seen so far is compared with `HYPOTHESIS_ORDERING`.

    Args:
        all_orderings: The population of orderings to sample from.
        number_of_steps: Number of batches drawn inside the loop.
        orders_added_each_step: Number of orderings drawn per batch.

    Returns:
        A float array of length `number_of_steps * orders_added_each_step`
        in which each step's loss (0.0 when the modal ordering equals the
        hypothesis, 1.0 otherwise) is repeated `orders_added_each_step`
        times.
    """
    # Converted once; np.array_equal would otherwise redo it at every step.
    hypothesis = np.asarray(HYPOTHESIS_ORDERING)

    # Occurrences of each ordering, seeded with one initial batch.
    # NOTE(review): because of this initial batch, the loss stored at step
    # idx is computed from (idx + 2) batches - confirm the offset is intended.
    initial_orders = random.choices(all_orderings, k=orders_added_each_step)
    orderings_count = Counter(map(tuple, initial_orders))

    # loss progression when adding more and more orderings
    loss_history = np.zeros(number_of_steps)

    for idx in range(number_of_steps):
        new_orders = random.choices(all_orderings, k=orders_added_each_step)
        # update the counter of orderings occurrences
        orderings_count.update(map(tuple, new_orders))

        # The most common (modal) ordering. most_common(1) selects the same
        # entry as most_common()[0] without sorting the whole counter.
        modal_ordering = orderings_count.most_common(1)[0][0]

        # the loss is 0 if the modal ordering is the same as the hypothesis,
        # and 1 otherwise
        loss_history[idx] = int(not np.array_equal(modal_ordering, hypothesis))

    if VERBOSE:
        print(orderings_count.most_common(1)[0])

    return np.repeat(loss_history, orders_added_each_step)
|
||||
|
||||
|
||||
################################################################################
|
||||
|
||||
def plot_modal_losses():
    """Plot the mean modal loss, its standard deviation and a confidence band.

    `get_mode_loss_progression` is run N times in parallel with joblib on
    the orderings returned by `find_orderings` for the module-level settings.
    For every sample size the mean of the 0/1 losses, their standard
    deviation and a Student confidence interval around the mean are drawn on
    the current matplotlib figure. Nothing is returned; showing the figure is
    left to the caller.
    """
    ###################
    # sampling settings
    N = 100  # number of repetitions of the experiment
    max_number_of_orders = 7500  # max sample size
    GRANULARITY = 12  # granularity of the sampling (orders by iteration)

    number_of_steps = max_number_of_orders // GRANULARITY

    all_orderings = find_orderings(
        parameter=PARAMETER,
        summed_attribute=SUMMED_ATTRIBUTE,
        criterion=CRITERION,
        length=LENGTH,
        authorized_parameter_values=AUTHORIZED_PARAMETER_VALUES)

    # len() counts orderings; `.size` would count every element of a 2D
    # array and does not exist on a plain list.
    print(f"there are {len(all_orderings)} orders in total :")
    tprint(all_orderings, limit=10)

    # make get_mode_loss_progression parallelizable
    gmlp = joblib.delayed(get_mode_loss_progression)

    ####
    # Aggregate multiple simulations

    # don't use the tqdm progress bar if there are some logs
    range_N = range(N) if VERBOSE else tqdm(range(N))

    # for my 8-core computer, n_jobs=7 is empirically the best value
    loss_history = joblib.Parallel(n_jobs=7)(
        gmlp(all_orderings,
             number_of_steps,
             orders_added_each_step=GRANULARITY)
        for _ in range_N
    )
    loss_history = np.array(loss_history)

    # the sum of losses for each number of steps
    losses = np.sum(loss_history, axis=0)

    if VERBOSE: print("losses :", losses, sep="\n")

    #####
    # average
    # since losses is the sum of losses, losses/N is the average
    mean = losses / N
    plt.plot(mean, color="green", label="loss average")

    #####
    # standard deviation
    # variance is (average of squares) - (square of the average)
    # since we only have 1 or 0, average of squares is just the average
    # so the variance is average - average**2
    # stddev is the square root of variance
    variance = mean - mean**2
    stddev = np.sqrt(variance)
    plt.plot(stddev, color="grey", label="loss standard deviation")

    ############################################################################
    # CONFIDENCE INTERVALS

    X = np.arange(mean.size)  # the x axis

    ######
    ## confidence interval
    ## assuming the experimental variance is the correct one
    #confidence = 0.95
    #alpha = 1 - confidence
    #eta = Norm.ppf(1 - alpha/2, loc=0, scale=1)
    #epsilon = eta * stddev / np.sqrt(N)
    #plt.fill_between(X, mean - epsilon, mean + epsilon,
    #                 color="blue", alpha=0.25,
    #                 label=f"{100*confidence}% confidence interval")

    #####
    # confidence interval
    # assuming each summed distribution is a normal distribution
    confidence = 0.999999
    delta = 1 - confidence

    # Corrected sample standard deviation: the variance above is rescaled by
    # N / (N - 1). The previous expression `(1 / N-1)` evaluated to
    # (1/N) - 1, a negative factor, so the square root was NaN.
    S = np.sqrt((N / (N - 1)) * variance)

    # Student quantile with N - 1 degrees of freedom, paired with the
    # corrected standard deviation S.
    eta = Student(df=N-1).ppf(1 - delta/2)
    epsilon = eta * S / np.sqrt(N)
    plt.fill_between(X, mean - epsilon, mean + epsilon,
                     color="green", alpha=0.2,
                     label=f"{100*confidence}% confidence interval")

    # confidence = 0.95
    # delta = 1 - confidence
    # eta = Student(df=X-1).ppf(1 - delta/2)
    # epsilon = eta * stddev / np.sqrt(X)
    # plt.fill_between(X, mean - epsilon, mean + epsilon,
    #                  color="green", alpha=0.5,
    #                  label=f"{100*confidence}% confidence interval")

    ######
    ## beta distribution
    ## confidence = 0.95
    #delta = 1 - confidence
    #alpha = np.cumsum(1 - loss_history, axis=1).mean(axis=0)
    #beta = np.cumsum(loss_history, axis=1).mean(axis=0)
    #epsilon = Beta.ppf(1 - delta/2, alpha, beta)
    #plt.fill_between(X, mean - epsilon, mean + epsilon,
    #                 color="orange", alpha=0.30,
    #                 label=f"{100*confidence} β confidence interval")

    ######
    ## fluctuation interval
    #confidence = 0.1
    #alpha = 1-confidence
    #k = Norm.ppf(alpha/2, loc=0, scale=1)
    #fluctuation = k * stddev
    #plt.fill_between(X, mean - fluctuation, mean + fluctuation,
    #                 color="orange", alpha=0.25,
    #                 label=f"{100*confidence}% fluctuation interval")

    ######
    ## hoeffding
    #t = 0.9999999
    #plt.plot(X, 2 * np.exp(-2 * t ** 2 / X),
    #         color="red")

    ######
    ## y = 1/2
    #plt.plot([0, mean.size], [0.5, 0.5],
    #         color="orange", alpha=0.25)
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: run the modal-loss experiment for the selected
    # hypothesis and display the resulting figure.

    # all_orderings = find_orderings(parameter=PARAMETER,
    #                                summed_attribute=SUMMED_ATTRIBUTE,
    #                                criterion=CRITERION,
    #                                length=LENGTH)
    # # print(all_orderings)
    # print(f"There are {len(all_orderings)} orderings in `all_orderings`")

    # for _ in range(20):
    #     dep = time()
    #     plot_modal_losses()
    #     print(round(time()-dep, 4))

    plt.style.use('dark_background')

    # HYPOTHESIS_ORDERING = ("bisque", "aquamarine")
    # plot_modal_losses()
    HYPOTHESIS_ORDERING = ("bisque", "blue")
    plot_modal_losses()
    plt.legend()

    ax = plt.gca()
    # ax.set_ylim([0, 1])

    # plt.ion()
    plt.show()
|
||||
|
||||
|
||||
|
@@ -1,3 +1,7 @@
|
||||
"""
|
||||
This Module defines functions to compute the kendall tau distance between two
|
||||
rankings, and the kemeny-young rank aggregation method.
|
||||
"""
|
||||
import numpy as np
|
||||
from numba import jit, njit
|
||||
from itertools import permutations
|
||||
@@ -18,7 +22,6 @@ Number = int|float
|
||||
|
||||
|
||||
|
||||
|
||||
def kendall_tau_dist(ranking_a: list[int], ranking_b: list[int]) -> Number:
|
||||
"""The kendall τ distance between two rankings / permutations.
|
||||
It is the number of pairs of elements that are ranked in opposite order by ranking_a and ranking_b.
|
||||
@@ -42,9 +45,9 @@ def __tau(A: list[int], B: list[int]) -> int:
|
||||
|
||||
|
||||
def rank_aggregation(rankings: list[list[int]]) -> tuple[int, tuple[int, ...]]:
|
||||
"""Brute-force kemeny-young rank aggregation.
|
||||
"""Return the order elected by the kemeny-young method.
|
||||
Args:
|
||||
ranks: A list of the ranks (2D numpy array).
|
||||
ranks: A list of the ranks (2D numpy array) to elect from.
|
||||
Returns:
|
||||
int, list: The minimal sum of distances to ranks, the rank of minimal distance.
|
||||
"""
|
||||
@@ -67,6 +70,9 @@ def rank_aggregation(rankings: list[list[int]]) -> tuple[int, tuple[int, ...]]:
|
||||
return min_dist, best_ranking
|
||||
|
||||
|
||||
|
||||
#################################### TESTS #####################################
|
||||
|
||||
if __name__ == '__main__':
|
||||
ranks = np.array([[0, 1, 2, 3, 4],
|
||||
[0, 1, 3, 2, 4],
|
||||
|
@@ -7,6 +7,7 @@ from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
class QueryGenerator(ABC):
|
||||
"""Abstract class to define what methods should a query generator have."""
|
||||
@abstractmethod
|
||||
def __init__(self): ...
|
||||
|
||||
@@ -14,18 +15,23 @@ class QueryGenerator(ABC):
|
||||
def __str__(self) -> str: ...
|
||||
|
||||
|
||||
class QueryWithParameter(QueryGenerator):
|
||||
# DEFAULT_AUTHORIZED_PARAMETER_VALUES: tuple[str, ...] = ("foo", "bar")
|
||||
class QueryWithParameter(QueryGenerator, ABC):
|
||||
"""Abstract class for query generators with our 3 parameters.
|
||||
This class implements the management of 3 attributes: `parameter`,
|
||||
`authorized_parameter_values` and `summed_attribute`. They are managed so
|
||||
that there is no typing error, and using default values. Importantly, the
|
||||
default value of authorized_parameter_values (when not given or set to
|
||||
None) is the value of `self.DEFAULT_AUTHORIZED_PARAMETER_VALUES`.
|
||||
"""
|
||||
|
||||
def __init__(self, parameter: str|None =None,
|
||||
authorized_parameter_values: tuple[str, ...] | None = None,
|
||||
summed_attribute: str|None =None):
|
||||
if parameter is None: raise ValueError
|
||||
self.parameter = str(parameter)
|
||||
self.__parameter = str(parameter)
|
||||
|
||||
if authorized_parameter_values is None:
|
||||
authorized_parameter_values = self.DEFAULT_AUTHORIZED_PARAMETER_VALUES
|
||||
self.authorized_parameter_values = authorized_parameter_values
|
||||
self.__authorized_parameter_values = authorized_parameter_values
|
||||
self.__force_typing_on_authorized_parameter_values()
|
||||
|
||||
if summed_attribute is None: raise ValueError
|
||||
self.summed_attribute = str(summed_attribute)
|
||||
@@ -39,6 +45,8 @@ class QueryWithParameter(QueryGenerator):
|
||||
self.__parameter = str(value)
|
||||
|
||||
def __force_typing_on_authorized_parameter_values(self):
|
||||
if self.__authorized_parameter_values is None:
|
||||
self.__authorized_parameter_values = self.DEFAULT_AUTHORIZED_PARAMETER_VALUES
|
||||
self.__authorized_parameter_values = tuple(
|
||||
map(str, self.__authorized_parameter_values))
|
||||
|
||||
@@ -54,6 +62,8 @@ class QueryWithParameter(QueryGenerator):
|
||||
|
||||
|
||||
class QueryWithParameterGroupedByCriteria(QueryWithParameter):
|
||||
"""Similar to QueryWithParameter, but with an additional parameter: `criteria`.
|
||||
The results are grouped by criteria, and values of `summed_attribute` are summed for each `parameter`, to give an order on `parameter`'s values"""
|
||||
|
||||
def __init__(self, parameter: str|None =None,
|
||||
authorized_parameter_values: tuple[str, ...] | None =None,
|
||||
@@ -67,7 +77,7 @@ class QueryWithParameterGroupedByCriteria(QueryWithParameter):
|
||||
authorized_parameter_values = self.DEFAULT_AUTHORIZED_PARAMETER_VALUES
|
||||
self.authorized_parameter_values = authorized_parameter_values
|
||||
|
||||
self.criteria = criteria
|
||||
self.__criteria = str(criteria)
|
||||
|
||||
if summed_attribute is None: raise ValueError
|
||||
self.summed_attribute = str(summed_attribute)
|
||||
@@ -162,7 +172,7 @@ class QuerySSBWithParameterGroupedByCriteria(QueryWithParameterGroupedByCriteria
|
||||
res += "INNER JOIN date ON lo_orderdate = D_DATEKEY\n"
|
||||
|
||||
if self.authorized_parameter_values is not None:
|
||||
res += "WHERE {self.parameter} IN {self.authorized_parameter_values}\n"
|
||||
res += f"WHERE {self.parameter} IN {self.authorized_parameter_values}\n"
|
||||
|
||||
|
||||
res += f"""
|
||||
|
180
src/querying.py
180
src/querying.py
@@ -1,26 +1,36 @@
|
||||
import sqlite3
|
||||
import numpy as np
|
||||
from tprint import tprint
|
||||
|
||||
from joblib import Memory # for persistent memoïzation
|
||||
|
||||
from query_generator import *
|
||||
import orderankings as odrk
|
||||
import kemeny_young as km
|
||||
from joblib import Memory
|
||||
|
||||
import yaml # to load config file
|
||||
from os import environ # access environment variables
|
||||
|
||||
|
||||
# persistent memoïzation
|
||||
memory = Memory("cache")
|
||||
memory = Memory("src/cache")
|
||||
|
||||
DATABASE_NAME = "flight_delay"
|
||||
DATABASE_NAME = "SSB"
|
||||
VENV_PATH = environ.get('VIRTUAL_ENV')
|
||||
|
||||
with open(VENV_PATH + "/src/config.yaml") as config_file:
|
||||
cfg = yaml.load(config_file, Loader=yaml.Loader)
|
||||
|
||||
VERBOSE = cfg["verbose"]["querying"]
|
||||
|
||||
DATABASE_NAME = cfg["database_name"]
|
||||
if VERBOSE: print("using database", DATABASE_NAME)
|
||||
|
||||
|
||||
################################################################################
|
||||
# Connection to sqlite database
|
||||
|
||||
odrk.VERBOSE = False
|
||||
VERBOSE = True
|
||||
|
||||
# initialize database connection
|
||||
DATABASE_FILE = f"../{DATABASE_NAME}_dataset/{DATABASE_NAME}.db"
|
||||
DATABASE_FILE = f"{DATABASE_NAME}_dataset/{DATABASE_NAME}.db"
|
||||
if VERBOSE: print(f"connecting to {DATABASE_FILE}")
|
||||
CON = sqlite3.connect(DATABASE_FILE)
|
||||
CUR = CON.cursor()
|
||||
@@ -39,10 +49,10 @@ def query(q: str) -> list[tuple]:
|
||||
|
||||
if DATABASE_NAME == "flight_delay":
|
||||
QUERY_PARAM_GB_FACTORY = QueryFlightWithParameterGroupedByCriteria
|
||||
QUERY_PARAM_FACTORY = QueryFlightWithParameter
|
||||
elif DATABASE_NAME == "SSB":
|
||||
QUERY_PARAM_GB_FACTORY = QuerySSBWithParameterGroupedByCriteria
|
||||
QUERY_PARAM_FACTORY = QuerySSBWithParameter
|
||||
else:
|
||||
raise ValueError(f"Unknown database : {DATABASE_NAME}")
|
||||
|
||||
################################################################################
|
||||
# orderings extraction functions
|
||||
@@ -50,7 +60,7 @@ elif DATABASE_NAME == "SSB":
|
||||
@memory.cache # persistent memoïzation
|
||||
def find_orderings(parameter: str, summed_attribute: str, criterion: tuple[str, ...],
|
||||
length: int,
|
||||
authorized_parameter_values: list[str] =None
|
||||
authorized_parameter_values: tuple[str, ...] | None =None
|
||||
) -> list[list[str]]:
|
||||
"""Gather the list of every ordering returned by queries using given values
|
||||
of parameter, summed_attribute, and all given values of criterion.
|
||||
@@ -65,15 +75,10 @@ def find_orderings(parameter: str, summed_attribute: str, criterion: tuple[str,
|
||||
"""
|
||||
# instantiate the query generator
|
||||
qg = QUERY_PARAM_GB_FACTORY(parameter=parameter,
|
||||
authorized_parameter_values=authorized_parameter_values,
|
||||
summed_attribute=summed_attribute,
|
||||
criteria=None)
|
||||
|
||||
if authorized_parameter_values is None:
|
||||
# reduce the number of compared parameter values
|
||||
qg.authorized_parameter_values = qg.authorized_parameter_values#[:length]
|
||||
else:
|
||||
qg.authorized_parameter_values = authorized_parameter_values#[:length]
|
||||
|
||||
# set of all the orderings found
|
||||
# the key is the value in the criteria column
|
||||
orderings = list()
|
||||
@@ -104,145 +109,4 @@ def find_orderings(parameter: str, summed_attribute: str, criterion: tuple[str,
|
||||
return correct_length_orderings
|
||||
|
||||
|
||||
@memory.cache # persistent memoïzation
|
||||
def find_true_ordering_ranking(parameter: str,
|
||||
summed_attribute: str,
|
||||
length: int,
|
||||
authorized_parameter_values: tuple[str,...]|None =None
|
||||
) -> tuple[list[list[str]], list[list[int]]]:
|
||||
"""Return the true (ordering, ranking), considering the data as a whole (no
|
||||
grouping by), and getting the true order (no rankings aggregation)."""
|
||||
if authorized_parameter_values is None:
|
||||
qg = QUERY_PARAM_FACTORY(parameter=parameter,
|
||||
summed_attribute=summed_attribute)
|
||||
else:
|
||||
qg = QUERY_PARAM_FACTORY(parameter=parameter,
|
||||
summed_attribute=summed_attribute,
|
||||
authorized_parameter_values=authorized_parameter_values)
|
||||
# qg.authorized_parameter_values = qg.authorized_parameter_values[:length]
|
||||
res = query(str(qg))
|
||||
if VERBOSE: print(res)
|
||||
ordering = odrk.get_orderings_from_table(res)
|
||||
ranking = odrk.rankings_from_orderings([ordering])[0]
|
||||
return ordering, ranking
|
||||
|
||||
################################################################################
|
||||
def flight_delay_main():
|
||||
PARAMETER = "departure_airport"
|
||||
SUMMED_ATTRIBUTE = "nb_flights"
|
||||
LENGTH = 5
|
||||
|
||||
ordering, ranking = find_true_ordering_ranking(parameter=PARAMETER,
|
||||
summed_attribute=SUMMED_ATTRIBUTE,
|
||||
length=LENGTH)
|
||||
print(ordering, ranking)
|
||||
|
||||
CRITERION = [
|
||||
# "airline",
|
||||
# "departure_hour",
|
||||
"day",
|
||||
# "month",
|
||||
]
|
||||
rng = np.random.default_rng()
|
||||
rng.shuffle(CRITERION)
|
||||
|
||||
grouped_orderings = find_orderings(parameter=PARAMETER,
|
||||
summed_attribute=SUMMED_ATTRIBUTE,
|
||||
criterion=CRITERION,
|
||||
length=LENGTH)
|
||||
# grouped_orderings = grouped_orderings[:5]
|
||||
# tprint(grouped_orderings, limit=20)
|
||||
print(grouped_orderings)
|
||||
# inferred_ordering = odrk.get_orderings_from_table(inferred_orderings_table)
|
||||
grouped_rankings = odrk.rankings_from_orderings(grouped_orderings)
|
||||
_, inferred_ranking = km.rank_aggregation(grouped_rankings)
|
||||
inferred_ranking = np.array(inferred_ranking)
|
||||
inferred_order = odrk.ordering_from_ranking(inferred_ranking,
|
||||
grouped_orderings[0])
|
||||
print("inferred :")
|
||||
print(inferred_order, inferred_ranking)
|
||||
|
||||
# print("distance =", km.kendall_tau_dist(ranking, inferred_ranking))
|
||||
|
||||
################################################################################
|
||||
def SSB_main():
|
||||
PARAMETER = "p_color"
|
||||
SUMMED_ATTRIBUTE = "lo_quantity"
|
||||
# SUMMED_ATTRIBUTE = "lo_revenue"
|
||||
# SUMMED_ATTRIBUTE = "lo_extendedprice"
|
||||
LENGTH = 2
|
||||
|
||||
CRITERION = (
|
||||
##### customer table
|
||||
"c_region",
|
||||
"c_city",
|
||||
"c_nation",
|
||||
|
||||
##### part table
|
||||
"p_category",
|
||||
"p_brand",
|
||||
"p_mfgr",
|
||||
"p_color",
|
||||
"p_type",
|
||||
"p_container",
|
||||
|
||||
##### supplier table
|
||||
"s_city",
|
||||
"s_nation",
|
||||
"s_region",
|
||||
|
||||
##### order date
|
||||
# "D_DATE",
|
||||
# "D_DATEKEY",
|
||||
# "D_DATE",
|
||||
# "D_DAYOFWEEK",
|
||||
# "D_MONTH",
|
||||
# "D_YEAR",
|
||||
# "D_YEARMONTHNUM",
|
||||
# "D_YEARMONTH",
|
||||
# "D_DAYNUMINWEEK"
|
||||
# "D_DAYNUMINMONTH",
|
||||
# "D_DAYNUMINYEAR",
|
||||
# "D_MONTHNUMINYEAR",
|
||||
"D_WEEKNUMINYEAR",
|
||||
# "D_SELLINGSEASON",
|
||||
# "D_LASTDAYINWEEKFL",
|
||||
# "D_LASTDAYINMONTHFL",
|
||||
# "D_HOLIDAYFL",
|
||||
# "D_WEEKDAYFL",
|
||||
)
|
||||
|
||||
HYPOTHESIS_ORDERING = ("aquamarine", "dark")
|
||||
|
||||
ordering, ranking = find_true_ordering_ranking(parameter=PARAMETER,
|
||||
summed_attribute=SUMMED_ATTRIBUTE,
|
||||
length=LENGTH,
|
||||
authorized_parameter_values=HYPOTHESIS_ORDERING)
|
||||
print(ordering, ranking)
|
||||
|
||||
grouped_orderings = find_orderings(parameter=PARAMETER,
|
||||
summed_attribute=SUMMED_ATTRIBUTE,
|
||||
criterion=CRITERION,
|
||||
length=LENGTH
|
||||
)
|
||||
|
||||
# grouped_orderings = grouped_orderings[:5]
|
||||
tprint(grouped_orderings, limit=20)
|
||||
# print(grouped_orderings)
|
||||
# inferred_ordering = odrk.get_orderings_from_table(inferred_orderings_table)
|
||||
grouped_rankings = odrk.rankings_from_orderings(grouped_orderings)
|
||||
_, inferred_ranking = km.rank_aggregation(grouped_rankings)
|
||||
inferred_ranking = np.array(inferred_ranking)
|
||||
inferred_order = odrk.ordering_from_ranking(inferred_ranking,
|
||||
grouped_orderings[0])
|
||||
print("inferred :")
|
||||
print(inferred_order, inferred_ranking)
|
||||
|
||||
# print("distance =", km.kendall_tau_dist(ranking, inferred_ranking))
|
||||
|
||||
if __name__ == '__main__':
|
||||
if DATABASE_NAME == "SSB":
|
||||
SSB_main()
|
||||
elif DATABASE_NAME == "flight_delay":
|
||||
flight_delay_main()
|
||||
|
||||
|
Reference in New Issue
Block a user