initial commit
This commit is contained in:
248
src/querying.py
Normal file
248
src/querying.py
Normal file
@@ -0,0 +1,248 @@
|
||||
import sqlite3
|
||||
import numpy as np
|
||||
from tprint import tprint
|
||||
from query_generator import *
|
||||
import orderankings as odrk
|
||||
import kemeny_young as km
|
||||
from joblib import Memory
|
||||
|
||||
# persistent memoïzation
|
||||
memory = Memory("cache")
|
||||
|
||||
DATABASE_NAME = "flight_delay"
|
||||
DATABASE_NAME = "SSB"
|
||||
|
||||
|
||||
################################################################################
|
||||
# Connexion to sqlite database
|
||||
|
||||
odrk.VERBOSE = False
|
||||
VERBOSE = True
|
||||
|
||||
# initialize database connection
|
||||
DATABASE_FILE = f"../{DATABASE_NAME}_dataset/{DATABASE_NAME}.db"
|
||||
if VERBOSE: print(f"connecting to {DATABASE_FILE}")
|
||||
CON = sqlite3.connect(DATABASE_FILE)
|
||||
CUR = CON.cursor()
|
||||
|
||||
|
||||
@memory.cache # persistent memoïzation
|
||||
def query(q: str) -> list[tuple]:
|
||||
"""Execute a given query and reture the result in a python list[tuple]."""
|
||||
if VERBOSE: print(f'sending query : {q}')
|
||||
res = CUR.execute(str(q))
|
||||
if VERBOSE: print("got response", res)
|
||||
return res.fetchall()
|
||||
|
||||
################################################################################
|
||||
# Choice of the right query generator
|
||||
|
||||
if DATABASE_NAME == "flight_delay":
|
||||
QUERY_PARAM_GB_FACTORY = QueryFlightWithParameterGroupedByCriteria
|
||||
QUERY_PARAM_FACTORY = QueryFlightWithParameter
|
||||
elif DATABASE_NAME == "SSB":
|
||||
QUERY_PARAM_GB_FACTORY = QuerySSBWithParameterGroupedByCriteria
|
||||
QUERY_PARAM_FACTORY = QuerySSBWithParameter
|
||||
|
||||
################################################################################
|
||||
# orderings extraction functions
|
||||
|
||||
@memory.cache # persistent memoïzation
|
||||
def find_orderings(parameter: str, summed_attribute: str, criterion: tuple[str, ...],
|
||||
length: int,
|
||||
authorized_parameter_values: list[str] =None
|
||||
) -> list[list[str]]:
|
||||
"""Gather the list of every ordering returned by queries using given values
|
||||
of parameter, summed_attribute, and all given values of criterion.
|
||||
Args:
|
||||
parameter (str): The value of the parameter attribute in the query generator.
|
||||
summed_attribute (str): The attribute that you wan to sum in order to sort the values.
|
||||
criterion (tuple[str]): The list of attributes that you want to group the query by.
|
||||
length (int): The length of orderings, hence the number of different
|
||||
values of parameter that you consider in the query.
|
||||
Returns:
|
||||
list[list]: The list of all found orderings.
|
||||
"""
|
||||
# instanciate the query generator
|
||||
qg = QUERY_PARAM_GB_FACTORY(parameter=parameter,
|
||||
summed_attribute=summed_attribute,
|
||||
criteria=None)
|
||||
|
||||
if authorized_parameter_values is None:
|
||||
# reduce the number of compared parameter values
|
||||
qg.authorized_parameter_values = qg.authorized_parameter_values#[:length]
|
||||
else:
|
||||
qg.authorized_parameter_values = authorized_parameter_values#[:length]
|
||||
|
||||
# ensemble de tous les ordres trouvés
|
||||
# la clef est la valeur dans la colonne criteria
|
||||
orderings = list()
|
||||
|
||||
for criteria in criterion:
|
||||
qg.criteria = criteria
|
||||
# if VERBOSE: print(repr(QG))
|
||||
table = query(str(qg))
|
||||
if VERBOSE:
|
||||
print(f"request result with criteria '{criteria}' :")
|
||||
tprint(table, limit=10)
|
||||
table_orders = odrk.get_all_orderings_from_table(table)
|
||||
# pprint(table_orders, compact=True, width=1000)
|
||||
# update the global list of all found orders
|
||||
orderings.extend(table_orders.values())
|
||||
|
||||
# keep only orders that are of the specified length
|
||||
# that means removing too short ones, and slicing too long ones
|
||||
correct_length_orderings = np.array(
|
||||
[ordrng[:length] for ordrng in orderings if len(ordrng) >= length]
|
||||
)
|
||||
|
||||
if VERBOSE:
|
||||
print(f"found {len(correct_length_orderings)} orderings :")
|
||||
print(correct_length_orderings)
|
||||
# tprint(correct_length_orderings)
|
||||
|
||||
return correct_length_orderings
|
||||
|
||||
|
||||
@memory.cache # persistent memoïzation
|
||||
def find_true_ordering_ranking(parameter: str,
|
||||
summed_attribute: str,
|
||||
length: int,
|
||||
authorized_parameter_values: tuple[str,...]|None =None
|
||||
) -> tuple[list[list[str]], list[list[int]]]:
|
||||
"""Return the true (ordering, ranking), considering the data as a whole (no
|
||||
grouping by), and getting the true order (no rankings aggregation)."""
|
||||
if authorized_parameter_values is None:
|
||||
qg = QUERY_PARAM_FACTORY(parameter=parameter,
|
||||
summed_attribute=summed_attribute)
|
||||
else:
|
||||
qg = QUERY_PARAM_FACTORY(parameter=parameter,
|
||||
summed_attribute=summed_attribute,
|
||||
authorized_parameter_values=authorized_parameter_values)
|
||||
# qg.authorized_parameter_values = qg.authorized_parameter_values[:length]
|
||||
res = query(str(qg))
|
||||
if VERBOSE: print(res)
|
||||
ordering = odrk.get_orderings_from_table(res)
|
||||
ranking = odrk.rankings_from_orderings([ordering])[0]
|
||||
return ordering, ranking
|
||||
|
||||
################################################################################
|
||||
def flight_delay_main():
|
||||
PARAMETER = "departure_airport"
|
||||
SUMMED_ATTRIBUTE = "nb_flights"
|
||||
LENGTH = 5
|
||||
|
||||
ordering, ranking = find_true_ordering_ranking(parameter=PARAMETER,
|
||||
summed_attribute=SUMMED_ATTRIBUTE,
|
||||
length=LENGTH)
|
||||
print(ordering, ranking)
|
||||
|
||||
CRITERION = [
|
||||
# "airline",
|
||||
# "departure_hour",
|
||||
"day",
|
||||
# "month",
|
||||
]
|
||||
rng = np.random.default_rng()
|
||||
rng.shuffle(CRITERION)
|
||||
|
||||
grouped_orderings = find_orderings(parameter=PARAMETER,
|
||||
summed_attribute=SUMMED_ATTRIBUTE,
|
||||
criterion=CRITERION,
|
||||
length=LENGTH)
|
||||
# grouped_orderings = grouped_orderings[:5]
|
||||
# tprint(grouped_orderings, limit=20)
|
||||
print(grouped_orderings)
|
||||
# inferred_ordering = odrk.get_orderings_from_table(inferred_orderings_table)
|
||||
grouped_rankings = odrk.rankings_from_orderings(grouped_orderings)
|
||||
_, inferred_ranking = km.rank_aggregation(grouped_rankings)
|
||||
inferred_ranking = np.array(inferred_ranking)
|
||||
inferred_order = odrk.ordering_from_ranking(inferred_ranking,
|
||||
grouped_orderings[0])
|
||||
print("inferred :")
|
||||
print(inferred_order, inferred_ranking)
|
||||
|
||||
# print("distance =", km.kendall_tau_dist(ranking, inferred_ranking))
|
||||
|
||||
################################################################################
|
||||
def SSB_main():
|
||||
PARAMETER = "p_color"
|
||||
SUMMED_ATTRIBUTE = "lo_quantity"
|
||||
# SUMMED_ATTRIBUTE = "lo_revenue"
|
||||
# SUMMED_ATTRIBUTE = "lo_extendedprice"
|
||||
LENGTH = 2
|
||||
|
||||
CRITERION = (
|
||||
##### customer table
|
||||
"c_region",
|
||||
"c_city",
|
||||
"c_nation",
|
||||
|
||||
##### part table
|
||||
"p_category",
|
||||
"p_brand",
|
||||
"p_mfgr",
|
||||
"p_color",
|
||||
"p_type",
|
||||
"p_container",
|
||||
|
||||
##### supplier table
|
||||
"s_city",
|
||||
"s_nation",
|
||||
"s_region",
|
||||
|
||||
##### order date
|
||||
# "D_DATE",
|
||||
# "D_DATEKEY",
|
||||
# "D_DATE",
|
||||
# "D_DAYOFWEEK",
|
||||
# "D_MONTH",
|
||||
# "D_YEAR",
|
||||
# "D_YEARMONTHNUM",
|
||||
# "D_YEARMONTH",
|
||||
# "D_DAYNUMINWEEK"
|
||||
# "D_DAYNUMINMONTH",
|
||||
# "D_DAYNUMINYEAR",
|
||||
# "D_MONTHNUMINYEAR",
|
||||
"D_WEEKNUMINYEAR",
|
||||
# "D_SELLINGSEASON",
|
||||
# "D_LASTDAYINWEEKFL",
|
||||
# "D_LASTDAYINMONTHFL",
|
||||
# "D_HOLIDAYFL",
|
||||
# "D_WEEKDAYFL",
|
||||
)
|
||||
|
||||
HYPOTHESIS_ORDERING = ("aquamarine", "dark")
|
||||
|
||||
ordering, ranking = find_true_ordering_ranking(parameter=PARAMETER,
|
||||
summed_attribute=SUMMED_ATTRIBUTE,
|
||||
length=LENGTH,
|
||||
authorized_parameter_values=HYPOTHESIS_ORDERING)
|
||||
print(ordering, ranking)
|
||||
|
||||
grouped_orderings = find_orderings(parameter=PARAMETER,
|
||||
summed_attribute=SUMMED_ATTRIBUTE,
|
||||
criterion=CRITERION,
|
||||
length=LENGTH
|
||||
)
|
||||
|
||||
# grouped_orderings = grouped_orderings[:5]
|
||||
tprint(grouped_orderings, limit=20)
|
||||
# print(grouped_orderings)
|
||||
# inferred_ordering = odrk.get_orderings_from_table(inferred_orderings_table)
|
||||
grouped_rankings = odrk.rankings_from_orderings(grouped_orderings)
|
||||
_, inferred_ranking = km.rank_aggregation(grouped_rankings)
|
||||
inferred_ranking = np.array(inferred_ranking)
|
||||
inferred_order = odrk.ordering_from_ranking(inferred_ranking,
|
||||
grouped_orderings[0])
|
||||
print("inferred :")
|
||||
print(inferred_order, inferred_ranking)
|
||||
|
||||
# print("distance =", km.kendall_tau_dist(ranking, inferred_ranking))
|
||||
|
||||
if __name__ == '__main__':
|
||||
if DATABASE_NAME == "SSB":
|
||||
SSB_main()
|
||||
elif DATABASE_NAME == "flight_delay":
|
||||
flight_delay_main()
|
||||
|
Reference in New Issue
Block a user