Better management of private variables.

Add a forgotten f for f-strings.
Use the config file instead of global variables.
2024-06-27 17:24:33 +02:00 · 2024-06-27 17:24:04 +02:00 · 2024-06-27 17:23:19 +02:00 · 2024-06-27 17:23:05 +02:00 · 2024-06-27 17:22:08 +02:00 · 2024-06-27 17:21:33 +02:00
7 changed files with 57 additions and 598 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,7 @@ bin/*
 include/*
 lib/*
 share/*
-pyenv.cfg
+pyvenv.cfg
 src/__pycache__/*
 .DS_Store
 modal_losses N=1000 nb_orders=10000 granularity=1.png
--- a/9
+++ b/9
@@ -5,12 +5,15 @@ DATABASE_FILE=${DATABASE_FOLDER}/${DATABASE_NAME}.db

 all: execute-script

-execute-script:
-	python3 concentration_test.py
+execute-script: requirements.txt
+	source bin/activate; \
+	python3 src/concentration_test.py; \

-pip-install:
+requirements.txt:
 	bin/pip3 install -r requirements.txt

+# run o
+
 reset: delete-database import-from-csv

 open:
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ fastcache
 tqdm
 joblib
 scipy
+PyYAML
--- a/src/concentration_test.py
+++ b/src/concentration_test.py
@@ -1,425 +0,0 @@
-import matplotlib.pyplot as plt
-from matplotlib.colors import CSS4_COLORS
-import numpy as np
-from scipy.stats import norm as Norm, beta as Beta, t as Student
-from tprint import tprint
-import orderankings as odrk
-from querying import find_orderings
-from kemeny_young import kendall_tau_dist, rank_aggregation
-from tqdm import tqdm
-from collections import Counter, defaultdict
-import joblib
-from functools import partial
-import random
-
-# Random number generator for the whole program
-RNG = np.random.default_rng(1234)
-
-VERBOSE = True
-VERBOSE = False
-
-
-################## DATA SETTINGS (parameters, hypothesis...) ###################
-
-# """ comment this line when using the SSB dataset
-# SSB dataset settings  # {{{
-
-PARAMETER = "p_color"
-SUMMED_ATTRIBUTE = "lo_quantity"
-# SUMMED_ATTRIBUTE = "lo_revenue"
-# SUMMED_ATTRIBUTE = "lo_extendedprice"
-LENGTH = 2
-
-authorized_parameter_values = {
-        "p_size": tuple(map(int, range(50))),
-        "p_color": tuple(CSS4_COLORS.keys()),
-        }
-AUTHORIZED_PARAMETER_VALUES = authorized_parameter_values[PARAMETER]
-
-CRITERION = (
-        ##### customer table
-        # "c_region",
-        "c_city",
-        # "c_nation",
-
-        ##### part table
-        "p_category",
-        "p_brand",
-        # "p_mfgr",
-        # "p_color",
-        # "p_type",
-        # "p_container",
-
-        ##### supplier table
-        "s_city",
-        # "s_nation",
-        # "s_region",
-
-        ##### order date
-        # "D_DATE",
-        # "D_DATEKEY",
-        # "D_DATE",
-        # "D_DAYOFWEEK",
-        # "D_MONTH",
-        # "D_YEAR",
-        # "D_YEARMONTHNUM",
-        # "D_YEARMONTH",
-        # "D_DAYNUMINWEEK"
-        # "D_DAYNUMINMONTH",
-        # "D_DAYNUMINYEAR",
-        # "D_MONTHNUMINYEAR",
-        # "D_WEEKNUMINYEAR",
-        # "D_SELLINGSEASON",
-        # "D_LASTDAYINWEEKFL",
-        # "D_LASTDAYINMONTHFL",
-        # "D_HOLIDAYFL",
-        # "D_WEEKDAYFL",
-    )
-
-HYPOTHESIS_ORDERING = ("bisque", "aquamarine")
-HYPOTHESIS_ORDERING = ("bisque", "blue")
-
-# HYPOTHESIS_ORDERING = [2, 32]
-# HYPOTHESIS_ORDERING = [30, 18]
-# HYPOTHESIS_ORDERING = [37, 49, 10]
-
-
-# }}}
-""" # flight_delay dataset settings {{{
-
-PARAMETER = "departure_airport"
-SUMMED_ATTRIBUTE = "nb_flights"
-LENGTH = 3
-
-CRITERION = (
-        # "airline",
-        "departure_hour",  # simpson's paradox ?
-        # "day",
-        # "month",
-        # "year",
-        )
-
-
-GLOBAL_ORDERING = ['ATL', 'ORD', 'DFW', 'DEN', 'LAX', 'IAH', 'LAS',
-                   'SFO', 'PHX', 'MCO', 'SEA', 'CLT', 'MSP', 'LGA',
-                   'DTW', 'EWR', 'BOS', 'BWI', 'SLC', 'JFK']
-AUTHORIZED_PARAMETER_VALUES = GLOBAL_ORDERING
-
-
-# Correct hypothesis for each length (so the loss converges to 0)
-CORRECT_ORDERINGS = defaultdict(lambda: GLOBAL_ORDERING)
-CORRECT_ORDERINGS[2] = ['ATL', 'DEN']
-CORRECT_ORDERINGS[3] = ['ATL', 'DFW', 'ORD']
-CORRECT_ORDERINGS[4] = ['ATL', 'DEN', 'DFW', 'ORD']
-CORRECT_ORDERINGS[5] = ['ATL', 'ORD', 'DFW', 'DEN', 'LAX']
-# now select the right one according to the LENGTH
-CORRECT_ORDERING = CORRECT_ORDERINGS[LENGTH][:LENGTH]
-
-# Use the correct ordering
-HYPOTHESIS_ORDERING = CORRECT_ORDERING
-print(HYPOTHESIS_ORDERING)
-
-
-# HYPOTHESIS_ORDERING = ['ATL', 'ORD', 'DWF', 'DEN', 'LAX']
-# HYPOTHESIS_ORDERING = ['ATL', 'ORD', 'DFW', 'LAX', 'DEN', 'IAH'][:LENGTH]
-# HYPOTHESIS_ORDERING = ['ATL', 'ORD', 'DFW', 'DEN', 'LAS', 'LAX', 'IAH'][:LENGTH]
-# HYPOTHESIS_ORDERING = ['ORD', 'ATL', 'DEN', 'DFW', 'LAX']  # interesting loss curve
-
-assert len(HYPOTHESIS_ORDERING) == LENGTH
-
-# }}}
-# """
-
-
-def orderings_average_loss(orderings: list[list[str]], truth: list[str]) -> float:# {{{
-    """This loss is the the average of kendall tau distances between the truth
-    and each ordering."""
-    rankings = odrk.rankings_from_orderings(orderings)
-    true_ranking = odrk.rankings_from_orderings([truth])[0]
-    return rankings_average_loss(rankings, true_ranking)# }}}
-
-
-def rankings_average_loss(rankings: list[list[int]], truth: list[int]) -> float:# {{{
-    distance = sum(kendall_tau_dist(rkng, truth) for rkng in rankings)
-    length = len(rankings)
-    # apparently, this is what works for a good normalization
-    return distance / length
-    # return distance * 2 / (length * (length - 1))}}}
-
-
-def kmny_dist_loss(orderings: list[list[str]], truth: list[str]) -> int:# {{{
-    """Return the kendall tau distance between the truth and the kemeny-young
-    aggregation of orderings"""
-    _, agg_rank = rank_aggregation(odrk.rankings_from_orderings(orderings))
-    aggregation = odrk.ordering_from_ranking(agg_rank, truth)
-    loss = kendall_tau_dist(
-            odrk.ranking_from_ordering(aggregation),
-            odrk.ranking_from_ordering(truth))
-    return loss
-    # print(aggregation, HYPOTHESIS_ORDERING, kdl_agg_dist)}}}
-
-
-def get_loss_progression(): # {{{
-    grouped_orderings = find_orderings(parameter=PARAMETER,
-                                        summed_attribute=SUMMED_ATTRIBUTE,
-                                        criterion=CRITERION,
-                                        length=LENGTH)
-    RNG.shuffle(grouped_orderings)
-
-    average_losses = []
-    kendal_aggregation_losses = []
-
-    for nb_considered_orderings in range(1, len(grouped_orderings)+1):
-        # loss as the average distance from truth to all considered orderings
-        considered_orderings = grouped_orderings[:nb_considered_orderings]
-        loss = orderings_average_loss(orderings=considered_orderings,
-                                      truth=HYPOTHESIS_ORDERING)
-
-        # loss as the distance between truth and the aggregation
-        kdl_agg_loss = kmny_dist_loss(orderings=considered_orderings,
-                                      truth=HYPOTHESIS_ORDERING)
-        kendal_aggregation_losses.append(kdl_agg_loss)
-
-        if VERBOSE:
-            print(f"using {nb_considered_orderings} orderings")
-            tprint(considered_orderings)
-            print("truth :", HYPOTHESIS_ORDERING)
-            print("loss =", loss)
-        average_losses.append(loss)
-    return average_losses, kendal_aggregation_losses
-    # }}}
-
-def plot_loss_progression(): # {{{
-    """Plot the progression of losses when using more and more of the values
-    (see get_loss_progression)."""
-    N = 20
-
-    avg_loss_progression, kdl_agg_loss_progression = get_loss_progression()
-    avg_loss_progression = np.array(avg_loss_progression)
-    kdl_agg_loss_progression = np.array(kdl_agg_loss_progression)
-
-    for _ in tqdm(range(N-1), leave=False):
-        avg_lp, kmny_lp = get_loss_progression()
-        avg_loss_progression += avg_lp
-        kdl_agg_loss_progression += kmny_lp
-        # print(progression)
-    if VERBOSE:
-        print(avg_loss_progression)
-        print(kdl_agg_loss_progression)
-    plt.plot(avg_loss_progression, color="orange")
-    plt.plot(kdl_agg_loss_progression, color="green")
-    # }}}
-
-def get_mode_loss_progression(all_orderings: list[list[str]],
-                              number_of_steps: int,
-                              orders_added_each_step: int =1) -> list[bool]:
-
-    # all_rankings = odrk.rankings_from_orderings(all_orderings)
-
-    # considered_orderings = list(RNG.choice(all_orderings, size=orders_added_each_step))
-    considered_orderings = list(random.choices(all_orderings, k=orders_added_each_step))
-    # count occurrences of each ordering
-    orderings_count = Counter(map(tuple, considered_orderings))
-
-    # loss progression when adding more and more orderings
-    loss_history = np.zeros(number_of_steps)
-
-    # # random permutation of the orderings
-    # permuted_orderings = np.random.permutation(all_orderings)
-
-    for idx in range(number_of_steps):
-        # new_orders = RNG.choice(all_orderings, size=orders_added_each_step)
-        new_orders = random.choices(all_orderings, k=orders_added_each_step)
-        # new_orders = permuted_orderings[orders_added_each_step*idx:orders_added_each_step*(idx+1)]
-
-        # considered_orderings.extend(new_orders)
-        # update the counter of orderings occurrences
-        orderings_count.update(Counter(map(tuple, new_orders)))
-        # the most common (modal) ordering
-        modal_ordering = orderings_count.most_common()[0][0]
-        modal_ordering = np.array(modal_ordering)
-        # if VERBOSE: print(modal_ordering)
-        # the loss is 1 if the modal ordering is the same as the hypothesis
-        loss =  int(not np.array_equal(modal_ordering, HYPOTHESIS_ORDERING))
-        # loss = int((modal_ordering == HYPOTHESIS_ORDERING).all())
-        # loss = int(all(map(lambda x: x[0]==x[1],
-        #                    zip(modal_ordering, HYPOTHESIS_ORDERING))))
-        # add loss to the list of losses
-        loss_history[idx] = loss
-    if VERBOSE:
-        # print(loss_history, HYPOTHESIS_ORDERING)
-        print(orderings_count.most_common(1)[0])
-    return np.repeat(loss_history, orders_added_each_step)
-
-
-################################################################################
-
-def plot_modal_losses():
-    ###################
-    # sampling settings
-    N = 100  # number of repetitions of the experiment
-    max_number_of_orders = 7500  # max sample size
-    GRANULARITY = 12  # granularity of the sampling (orders by iteration)
-
-    number_of_steps = max_number_of_orders // GRANULARITY
-
-    all_orderings = find_orderings(
-            parameter=PARAMETER,
-            summed_attribute=SUMMED_ATTRIBUTE,
-            criterion=CRITERION,
-            length=LENGTH,
-            authorized_parameter_values=AUTHORIZED_PARAMETER_VALUES)
-
-    print(f"there are {all_orderings.size} orders in total :")
-    tprint(all_orderings, limit=10)
-
-
-    # make get_mode_loss_progression parallelizable
-    gmlp = joblib.delayed(get_mode_loss_progression)
-
-    ####
-    # Aggregate multiple simulations
-
-    # don't use the tqdm progress bar if there are some logs
-    range_N = range(N) if VERBOSE else tqdm(range(N))
-
-    # for my 8-core computer, n_jobs=7 is empirically the best value
-    loss_history = joblib.Parallel(n_jobs=7)(
-            gmlp(all_orderings,
-                 number_of_steps,
-                 orders_added_each_step=GRANULARITY)
-            for _ in range_N
-            )
-    loss_history = np.array(loss_history)
-
-    # the sum of losses for each number of steps
-    losses = np.sum(loss_history, axis=0)
-
-    if VERBOSE: print("losses :", losses, sep="\n")
-
-    #####
-    # average
-    # since losses is the sum of losses, losses/N is the average
-    mean = losses / N
-    plt.plot(mean, color="green", label="loss average")
-
-    #####
-    # standard deviation
-    # variance is (average of squares) - (square of the average)
-    # since we only have 1 or 0, average of squares is just the average
-    # so the variance is average - average**2
-    # stddev is the square root of variance
-    stddev = np.sqrt(mean - mean**2)
-    plt.plot(stddev, color="grey", label="loss standard deviation")
-
-
-
-    ############################################################################
-    # CONFIDENCE INTERVALS
-
-    X = np.arange(mean.size)  # the x axis
-
-    ######
-    ## confidence interval
-    ## assuming the experimental variance is the correct one
-    #confidence = 0.95
-    #alpha = 1 - confidence
-    #eta = Norm.ppf(1 - alpha/2, loc=0, scale=1)
-    #epsilon = eta * stddev / np.sqrt(N)
-    #plt.fill_between(X, mean - epsilon, mean + epsilon,
-    #                 color="blue", alpha=0.25,
-    #                 label=f"{100*confidence}% confidence interval")
-
-    #####
-    # confidence interval
-    # assuming each summed distribution is a normal distribution
-    confidence = 0.999999
-    delta = 1 - confidence
-
-    # corrected sample variance
-    S = np.sqrt((1 / N-1) * (mean - mean**2))
-
-    eta = Student(df=N-1).ppf(1 - delta/2)
-    epsilon = eta * stddev / np.sqrt(N)
-    plt.fill_between(X, mean - epsilon, mean + epsilon,
-                     color="green", alpha=0.2,
-                     label=f"{100*confidence}% confidence interval")
-
-    # confidence = 0.95
-    # delta = 1 - confidence
-    # eta = Student(df=X-1).ppf(1 - delta/2)
-    # epsilon = eta * stddev / np.sqrt(X)
-    # plt.fill_between(X, mean - epsilon, mean + epsilon,
-    #                  color="green", alpha=0.5,
-    #                  label=f"{100*confidence}% confidence interval")
-
-    ######
-    ## beta distribution
-    ## confidence = 0.95
-    #delta = 1 - confidence
-    #alpha = np.cumsum(1 - loss_history, axis=1).mean(axis=0)
-    #beta = np.cumsum(loss_history, axis=1).mean(axis=0)
-    #epsilon = Beta.ppf(1 - delta/2, alpha, beta)
-    #plt.fill_between(X, mean - epsilon, mean + epsilon,
-    #                 color="orange", alpha=0.30,
-    #                 label=f"{100*confidence} β confidence interval")
-
-
-    ######
-    ## fluctuation interval
-    #confidence = 0.1
-    #alpha = 1-confidence
-    #k = Norm.ppf(alpha/2, loc=0, scale=1)
-    #fluctuation = k * stddev
-    #plt.fill_between(X, mean - fluctuation, mean + fluctuation,
-    #                 color="orange", alpha=0.25,
-    #                 label=f"{100*confidence}% fluctuation interval")
-
-    ######
-    ## hoeffding
-    #t = 0.9999999
-    #plt.plot(X, 2 * np.exp(-2 * t ** 2 / X),
-    #         color="red")
-
-    ######
-    ## y = 1/2
-    #plt.plot([0, mean.size], [0.5, 0.5],
-    #         color="orange", alpha=0.25)
-
-if __name__ == '__main__':
-    rankings = np.array([[1, 3, 2, 4],
-                         [3, 4, 2, 1],
-                         [1, 2, 3, 4],
-                         [1, 3, 2, 4],
-                         [2, 3, 1, 4],
-                         [1, 3, 2, 1],
-                         [2, 3, 1, 4],
-                         [2, 3, 1, 4]])
-
-    # all_orderings = find_orderings(parameter=PARAMETER,
-    #                                summed_attribute=SUMMED_ATTRIBUTE,
-    #                                criterion=CRITERION,
-    #                                length=LENGTH)
-    # # print(all_orderings)
-    # print(f"There are {len(all_orderings)} orderings in `all_orderings`")
-
-    # for _ in range(20):
-    #     dep = time()
-    #     plot_modal_losses()
-    #     print(round(time()-dep, 4))
-
-    plt.style.use('dark_background')
-
-    # HYPOTHESIS_ORDERING = ("bisque", "aquamarine")
-    # plot_modal_losses()
-    HYPOTHESIS_ORDERING = ("bisque", "blue")
-    plot_modal_losses()
-    plt.legend()
-
-    ax = plt.gca()
-    # ax.set_ylim([0, 1])
-
-    # plt.ion()
-    plt.show()
-
-
--- a/src/kemeny_young.py
+++ b/src/kemeny_young.py
@@ -1,3 +1,7 @@
+"""
+This Module defines functions to compute the kendall tau distance between two
+rankings, and the kemeny-young rank aggregation method.
+"""
 import numpy as np
 from numba import jit, njit
 from itertools import permutations
@@ -18,7 +22,6 @@ Number = int|float



-
 def kendall_tau_dist(ranking_a: list[int], ranking_b: list[int]) -> Number:
    """The kendall τ distance between two rankings / permutations.
    It is the number of inversions that don't have the same sign within all pairs of an inversion of ranking_a and an inversion of ranking_b.
@@ -42,9 +45,9 @@ def __tau(A: list[int], B: list[int]) -> int:


 def rank_aggregation(rankings: list[list[int]]) -> tuple[int, tuple[int, ...]]:
-    """Brute-force kemeny-young rank aggregation.
+    """Return the order elected by the kemeny-young method.
    Args:
-        ranks: A list of the ranks (2D numpy array).
+        ranks: A list of the ranks (2D numpy array) to elect from.
    Returns:
        int, list: The minimal sum of distances to ranks, the rank of minimal distance.
    """
@@ -67,6 +70,9 @@ def rank_aggregation(rankings: list[list[int]]) -> tuple[int, tuple[int, ...]]:
    return min_dist, best_ranking


+
+#################################### TESTS #####################################
+
 if __name__ == '__main__':
    ranks = np.array([[0, 1, 2, 3, 4],
                      [0, 1, 3, 2, 4],
--- a/src/query_generator.py
+++ b/src/query_generator.py
@@ -7,6 +7,7 @@ from abc import ABC, abstractmethod


 class QueryGenerator(ABC):
+    """Abstract class defining which methods a query generator should have."""
    @abstractmethod
    def __init__(self): ...

@@ -14,18 +15,23 @@ class QueryGenerator(ABC):
    def __str__(self) -> str: ...


-class QueryWithParameter(QueryGenerator):
-    # DEFAULT_AUTHORIZED_PARAMETER_VALUES: tuple[str, ...] = ("foo", "bar")
+class QueryWithParameter(QueryGenerator, ABC):
+    """Abstract class for query generators with our 3 parameters.
+    This class implements the management of 3 attributes : `parameter`,
+    `authorized_parameter_values` and `summed_attribute`. They are managed so
+    that there is no typing error, and using default values. Importantly, the
+    default value of authorized_parameter_values (when not given or set to
+    None) is the value of `self.DEFAULT_AUTHORIZED_PARAMETER_VALUES`.
+    """

    def __init__(self, parameter: str|None =None,
                 authorized_parameter_values: tuple[str, ...] | None = None,
                 summed_attribute: str|None =None):
        if parameter is None: raise ValueError
-        self.parameter = str(parameter)
+        self.__parameter = str(parameter)

-        if authorized_parameter_values is None:
-            authorized_parameter_values = self.DEFAULT_AUTHORIZED_PARAMETER_VALUES
-        self.authorized_parameter_values = authorized_parameter_values
+        self.__authorized_parameter_values = authorized_parameter_values
+        self.__force_typing_on_authorized_parameter_values()

        if summed_attribute is None: raise ValueError
        self.summed_attribute = str(summed_attribute)
@@ -39,6 +45,8 @@ class QueryWithParameter(QueryGenerator):
        self.__parameter = str(value)

    def __force_typing_on_authorized_parameter_values(self):
+        if self.__authorized_parameter_values is None:
+            self.__authorized_parameter_values = self.DEFAULT_AUTHORIZED_PARAMETER_VALUES
        self.__authorized_parameter_values = tuple(
                map(str, self.__authorized_parameter_values))

@@ -54,6 +62,8 @@ class QueryWithParameter(QueryGenerator):


 class QueryWithParameterGroupedByCriteria(QueryWithParameter):
+    """Similar to QueryWithParameter, but with an additional parameter : `criteria`.
+    The results are grouped by criteria, and values of `summed_attribute` are summed for each `parameter`, to give an order on `parameter`'s values"""

    def __init__(self, parameter: str|None =None,
                 authorized_parameter_values: tuple[str, ...] | None =None,
@@ -67,7 +77,7 @@ class QueryWithParameterGroupedByCriteria(QueryWithParameter):
            authorized_parameter_values = self.DEFAULT_AUTHORIZED_PARAMETER_VALUES
        self.authorized_parameter_values = authorized_parameter_values

-        self.criteria = criteria
+        self.__criteria = str(criteria)

        if summed_attribute is None: raise ValueError
        self.summed_attribute = str(summed_attribute)
@@ -162,7 +172,7 @@ class QuerySSBWithParameterGroupedByCriteria(QueryWithParameterGroupedByCriteria
            res += "INNER JOIN date ON lo_orderdate = D_DATEKEY\n"

        if self.authorized_parameter_values is not None:
-            res += "WHERE {self.parameter} IN {self.authorized_parameter_values}\n"
+            res += f"WHERE {self.parameter} IN {self.authorized_parameter_values}\n"


        res += f"""
--- a/src/querying.py
+++ b/src/querying.py
@@ -1,26 +1,36 @@
 import sqlite3
 import numpy as np
 from tprint import tprint
+
+from joblib import Memory  # for persistent memoïzation
+
 from query_generator import *
 import orderankings as odrk
 import kemeny_young as km
-from joblib import Memory
+
+import yaml  # to load config file
+from os import environ  # access environment variables
+

 # persistent memoïzation
-memory = Memory("cache")
+memory = Memory("src/cache")

-DATABASE_NAME = "flight_delay"
-DATABASE_NAME = "SSB"
+VENV_PATH = environ.get('VIRTUAL_ENV')
+
+with open(VENV_PATH + "/src/config.yaml") as config_file:
+    cfg = yaml.load(config_file, Loader=yaml.Loader)
+
+VERBOSE = cfg["verbose"]["querying"]
+
+DATABASE_NAME = cfg["database_name"]
+if VERBOSE: print("using database", DATABASE_NAME)


 ################################################################################
 # Connexion to sqlite database

-odrk.VERBOSE = False
-VERBOSE = True
-
 # initialize database connection
-DATABASE_FILE = f"../{DATABASE_NAME}_dataset/{DATABASE_NAME}.db"
+DATABASE_FILE = f"{DATABASE_NAME}_dataset/{DATABASE_NAME}.db"
 if VERBOSE: print(f"connecting to {DATABASE_FILE}")
 CON = sqlite3.connect(DATABASE_FILE)
 CUR = CON.cursor()
@@ -39,10 +49,10 @@ def query(q: str) -> list[tuple]:

 if DATABASE_NAME == "flight_delay":
    QUERY_PARAM_GB_FACTORY = QueryFlightWithParameterGroupedByCriteria
-    QUERY_PARAM_FACTORY = QueryFlightWithParameter
 elif DATABASE_NAME == "SSB":
    QUERY_PARAM_GB_FACTORY = QuerySSBWithParameterGroupedByCriteria
-    QUERY_PARAM_FACTORY = QuerySSBWithParameter
+else:
+    raise ValueError(f"Unknown database : {DATABASE_NAME}")

 ################################################################################
 # orderings extraction functions
@@ -50,7 +60,7 @@ elif DATABASE_NAME == "SSB":
@memory.cache  # persistent memoïzation
 def find_orderings(parameter: str, summed_attribute: str, criterion: tuple[str, ...],
                   length: int,
-                   authorized_parameter_values: list[str] =None
+                   authorized_parameter_values: tuple[str, ...] | None =None
                   ) -> list[list[str]]:
    """Gather the list of every ordering returned by queries using given values
    of parameter, summed_attribute, and all given values of criterion.
@@ -65,15 +75,10 @@ def find_orderings(parameter: str, summed_attribute: str, criterion: tuple[str,
    """
    # instanciate the query generator
    qg = QUERY_PARAM_GB_FACTORY(parameter=parameter,
+                                authorized_parameter_values=authorized_parameter_values,
                                summed_attribute=summed_attribute,
                                criteria=None)

-    if authorized_parameter_values is None:
-        # reduce the number of compared parameter values
-        qg.authorized_parameter_values = qg.authorized_parameter_values#[:length]
-    else:
-        qg.authorized_parameter_values = authorized_parameter_values#[:length]
-
    # ensemble de tous les ordres trouvés
    # la clef est la valeur dans la colonne criteria
    orderings = list()
@@ -104,145 +109,4 @@ def find_orderings(parameter: str, summed_attribute: str, criterion: tuple[str,
    return correct_length_orderings


-@memory.cache  # persistent memoïzation
-def find_true_ordering_ranking(parameter: str,
-                               summed_attribute: str,
-                               length: int,
-                               authorized_parameter_values: tuple[str,...]|None =None
-                               ) -> tuple[list[list[str]], list[list[int]]]:
-    """Return the true (ordering, ranking), considering the data as a whole (no
-    grouping by), and getting the true order (no rankings aggregation)."""
-    if authorized_parameter_values is None:
-        qg = QUERY_PARAM_FACTORY(parameter=parameter,
-                                 summed_attribute=summed_attribute)
-    else:
-        qg = QUERY_PARAM_FACTORY(parameter=parameter,
-                                 summed_attribute=summed_attribute,
-                                 authorized_parameter_values=authorized_parameter_values)
-    # qg.authorized_parameter_values = qg.authorized_parameter_values[:length]
-    res = query(str(qg))
-    if VERBOSE: print(res)
-    ordering = odrk.get_orderings_from_table(res)
-    ranking = odrk.rankings_from_orderings([ordering])[0]
-    return ordering, ranking
-
-################################################################################
-def flight_delay_main():
-    PARAMETER = "departure_airport"
-    SUMMED_ATTRIBUTE = "nb_flights"
-    LENGTH = 5
-
-    ordering, ranking = find_true_ordering_ranking(parameter=PARAMETER,
-                                                   summed_attribute=SUMMED_ATTRIBUTE,
-                                                   length=LENGTH)
-    print(ordering, ranking)
-
-    CRITERION = [
-        # "airline",
-        # "departure_hour",
-        "day",
-        # "month",
-    ]
-    rng = np.random.default_rng()
-    rng.shuffle(CRITERION)
-
-    grouped_orderings = find_orderings(parameter=PARAMETER,
-                                       summed_attribute=SUMMED_ATTRIBUTE,
-                                       criterion=CRITERION,
-                                       length=LENGTH)
-    # grouped_orderings = grouped_orderings[:5]
-    # tprint(grouped_orderings, limit=20)
-    print(grouped_orderings)
-    # inferred_ordering = odrk.get_orderings_from_table(inferred_orderings_table)
-    grouped_rankings = odrk.rankings_from_orderings(grouped_orderings)
-    _, inferred_ranking = km.rank_aggregation(grouped_rankings)
-    inferred_ranking = np.array(inferred_ranking)
-    inferred_order = odrk.ordering_from_ranking(inferred_ranking,
-                                                grouped_orderings[0])
-    print("inferred :")
-    print(inferred_order, inferred_ranking)
-
-    # print("distance =", km.kendall_tau_dist(ranking, inferred_ranking))
-
-################################################################################
-def SSB_main():
-    PARAMETER = "p_color"
-    SUMMED_ATTRIBUTE = "lo_quantity"
-    # SUMMED_ATTRIBUTE = "lo_revenue"
-    # SUMMED_ATTRIBUTE = "lo_extendedprice"
-    LENGTH = 2
-
-    CRITERION = (
-            ##### customer table
-            "c_region",
-            "c_city",
-            "c_nation",
-
-            ##### part table
-            "p_category",
-            "p_brand",
-            "p_mfgr",
-            "p_color",
-            "p_type",
-            "p_container",
-
-            ##### supplier table
-            "s_city",
-            "s_nation",
-            "s_region",
-
-            ##### order date
-            # "D_DATE",
-            # "D_DATEKEY",
-            # "D_DATE",
-            # "D_DAYOFWEEK",
-            # "D_MONTH",
-            # "D_YEAR",
-            # "D_YEARMONTHNUM",
-            # "D_YEARMONTH",
-            # "D_DAYNUMINWEEK"
-            # "D_DAYNUMINMONTH",
-            # "D_DAYNUMINYEAR",
-            # "D_MONTHNUMINYEAR",
-            "D_WEEKNUMINYEAR",
-            # "D_SELLINGSEASON",
-            # "D_LASTDAYINWEEKFL",
-            # "D_LASTDAYINMONTHFL",
-            # "D_HOLIDAYFL",
-            # "D_WEEKDAYFL",
-        )
-
-    HYPOTHESIS_ORDERING = ("aquamarine", "dark")
-
-    ordering, ranking = find_true_ordering_ranking(parameter=PARAMETER,
-                                                   summed_attribute=SUMMED_ATTRIBUTE,
-                                                   length=LENGTH,
-                                                   authorized_parameter_values=HYPOTHESIS_ORDERING)
-    print(ordering, ranking)
-
-    grouped_orderings = find_orderings(parameter=PARAMETER,
-                                       summed_attribute=SUMMED_ATTRIBUTE,
-                                       criterion=CRITERION,
-                                       length=LENGTH
-                                       )
-
-    # grouped_orderings = grouped_orderings[:5]
-    tprint(grouped_orderings, limit=20)
-    # print(grouped_orderings)
-    # inferred_ordering = odrk.get_orderings_from_table(inferred_orderings_table)
-    grouped_rankings = odrk.rankings_from_orderings(grouped_orderings)
-    _, inferred_ranking = km.rank_aggregation(grouped_rankings)
-    inferred_ranking = np.array(inferred_ranking)
-    inferred_order = odrk.ordering_from_ranking(inferred_ranking,
-                                                grouped_orderings[0])
-    print("inferred :")
-    print(inferred_order, inferred_ranking)
-
-    # print("distance =", km.kendall_tau_dist(ranking, inferred_ranking))
-
-if __name__ == '__main__':
-    if DATABASE_NAME == "SSB":
-        SSB_main()
-    elif DATABASE_NAME == "flight_delay":
-        flight_delay_main()
Author	SHA1	Message	Date
Oscar Plaisant	f84aec4456	Better management of private variables. Add a forgotten f for f-strings.	2024-06-27 17:24:33 +02:00
Oscar Plaisant	b13f8ab039	Use the config file instead of global variables.	2024-06-27 17:24:04 +02:00
Oscar Plaisant	ff0f646d04	modify documentation	2024-06-27 17:23:19 +02:00
Oscar Plaisant	0835eff420	moved into old_concentration_test.py	2024-06-27 17:23:05 +02:00
Oscar Plaisant	87ae317c31	correct mistake	2024-06-27 17:22:08 +02:00
Oscar Plaisant	6a88756e84	add target to execute project and to update requirements if needed.	2024-06-27 17:21:33 +02:00
Oscar Plaisant	be6cbf46c6	add PyYAML (in order to load config)	2024-06-27 17:20:41 +02:00