Compare commits

..

14 Commits

Author SHA1 Message Date
Oscar Plaisant
eabe42c73f small visual changes 2024-07-03 20:49:24 +02:00
Oscar Plaisant
74d25bd985 beter data indentation 2024-07-02 15:32:55 +02:00
Oscar Plaisant
b97029d090 add documentation 2024-07-02 03:58:06 +02:00
Oscar Plaisant
b13c75542b add comment 2024-07-02 03:57:48 +02:00
Oscar Plaisant
495bb4b63c update 2024-07-02 03:08:46 +02:00
Oscar Plaisant
626b3a7327 update 2024-07-02 03:05:59 +02:00
Oscar Plaisant
e3df57ccde update 2024-07-02 02:32:24 +02:00
Oscar Plaisant
f84aec4456 Better gestion of privates variables.
Add a forgotten f for f-strings.
2024-06-27 17:24:33 +02:00
Oscar Plaisant
b13f8ab039 Use the config file instead of global variables. 2024-06-27 17:24:04 +02:00
Oscar Plaisant
ff0f646d04 modify documentation 2024-06-27 17:23:19 +02:00
Oscar Plaisant
0835eff420 moved into old_concentration_test.py 2024-06-27 17:23:05 +02:00
Oscar Plaisant
87ae317c31 correct mistake 2024-06-27 17:22:08 +02:00
Oscar Plaisant
6a88756e84 add target to execute project and to update requirements if needed. 2024-06-27 17:21:33 +02:00
Oscar Plaisant
be6cbf46c6 add PyYAML (in order to load config) 2024-06-27 17:20:41 +02:00
75 changed files with 712 additions and 1041 deletions

2
.gitignore vendored
View File

@@ -2,7 +2,7 @@ bin/*
include/*
lib/*
share/*
pyenv.cfg
pyvenv.cfg
src/__pycache__/*
.DS_Store
modal_losses N=1000 nb_orders=10000 granularity=1.png

View File

@@ -5,12 +5,15 @@ DATABASE_FILE=${DATABASE_FOLDER}/${DATABASE_NAME}.db
all: execute-script
execute-script:
python3 concentration_test.py
execute-script: requirements.txt
source bin/activate && python3 src/concentration_test.py;
pip-install:
# Install the required python packages
requirements.txt:
bin/pip3 install -r requirements.txt
# run o
reset: delete-database import-from-csv
open:

View File

@@ -1,4 +1,42 @@
# Description of the project
## General information
The execution is managed via the Makefile.
The python environment is managed via a virtual environment. Its configuration is standard.
If you need to install a new python package, add it to the `requirements.txt` file (using pip syntax). It should be installed automatically when you execute the project. Alternatively, you can run `make requirements.txt` to install it manually.
The installation of new databases (from csv) is managed in the Makefile.
# Configuration
The configuration is stored in the `src/config.yaml` file.
## Database-specific configuration
`database_name` should contain the name of the database to use. The database has to be stored in the proper directory structure (See the [Directory structure > Datasets](README.md#datasets)). This parameter is case sensitive.
Each database can have a separated and independent config.
This configuration is stored under a key named after the database.
For example, the database named `SSB` has its configuration under the `SSB:` key (and this configuration will be used only when `database_name` is `SSB`).
The following table explains every parameter that is used in the database specific configuration.
| key | type | usage |
| --- | ---- | ----- |
| `orders_length` | integer | The length of considered orderings |
| `hypothesis_ordering` | list[str] | The ordering to test the correctness of |
| `parameter` | str | The "parameter" attribute in the query (an attribute in the database). |
| `authorized_parameter_values` | list[str] | The restriction over possible values in the query's orderings (`WHERE parameter IN authorized_parameter_values`). |
| `summed_attribute` | str | The database attribute that is summed in the aggregation, and used to order the values. |
| `criterion` | list[str] | The list of possible values for the criterion in the query. When getting a random query, one of these values is chosen randomly for the criterion. |
The `query_generator` key is a parameter containing the name of the query-generator object that is used when building the query. You should not modify this unless you modify the code accordingly.
# Directory structure of the project
## Virtual environment

View File

@@ -5,12 +5,15 @@ from tprint import tprint
import orderankings as odrk
from querying import find_orderings
from kemeny_young import kendall_tau_dist, rank_aggregation
from losses import *
from tqdm import tqdm
from collections import Counter, defaultdict
import joblib
from functools import partial
import random
import yaml
# load configuration from config.yaml
from config import CONFIG as CFG
# Random number generator for the whole program
# RNG = np.random.default_rng(1234)
@@ -18,61 +21,32 @@ import yaml
######################## YAML CONFIG (src/config.yaml) #########################
with open('src/config.yaml') as config_file:
cfg = yaml.load(config_file, Loader=yaml.Loader)
DATABASE_NAME = cfg["database_name"]
DATABASE_NAME = CFG["database_name"]
VERBOSE = cfg["verbose"]["concentration_test"]
VERBOSE = CFG["verbose"]["concentration_test"]
################## DATA SETTINGS (parameters, hypothesis...) ###################
# loaded from src/config.yaml
PARAMETER = tuple(cfg[DATABASE_NAME]["parameter"])
SUMMED_ATTRIBUTE = tuple(cfg[DATABASE_NAME]["summmed_attribute"])
PARAMETER = tuple(CFG[DATABASE_NAME]["parameter"])
SUMMED_ATTRIBUTE = tuple(CFG[DATABASE_NAME]["summmed_attribute"])
# SUMMED_ATTRIBUTE = "lo_revenue"
# SUMMED_ATTRIBUTE = "lo_extendedprice"
LENGTH = cfg[DATABASE_NAME]["orders_length"]
LENGTH = CFG[DATABASE_NAME]["orders_length"]
AUTHORIZED_PARAMETER_VALUES = tuple(cfg[DATABASE_NAME]["authorized_parameter_values"])
AUTHORIZED_PARAMETER_VALUES = tuple(CFG[DATABASE_NAME]["authorized_parameter_values"])
CRITERION = tuple(cfg[DATABASE_NAME]["criterion"])
CRITERION = tuple(CFG[DATABASE_NAME]["criterion"])
HYPOTHESIS_ORDERING = tuple(cfg[DATABASE_NAME]["hypothesis_ordering"])
HYPOTHESIS_ORDERING = tuple(CFG[DATABASE_NAME]["hypothesis_ordering"])
assert len(HYPOTHESIS_ORDERING) == LENGTH
################################ LOSS FUNCTIONS ################################
def orderings_average_loss(orderings: list[list[str]], truth: list[str]) -> float:# {{{
    """Return the average Kendall tau distance between `truth` and each ordering.

    Args:
        orderings: orderings (lists of labels) to compare against the truth.
        truth: the reference ordering.

    Both arguments are orderings of labels; they are converted to rankings
    before the distances are computed by `rankings_average_loss`.
    """
    rankings = odrk.rankings_from_orderings(orderings)
    # the truth is converted through the same helper to keep both encodings consistent
    true_ranking = odrk.rankings_from_orderings([truth])[0]
    return rankings_average_loss(rankings, true_ranking)# }}}
def rankings_average_loss(rankings: list[list[int]], truth: list[int]) -> float:# {{{
    """Return the mean Kendall tau distance between `truth` and each ranking."""
    total_distance = 0
    for ranking in rankings:
        total_distance += kendall_tau_dist(ranking, truth)
    # apparently, dividing by the number of rankings is what works for a good
    # normalization (rather than the pair-count 2 / (n * (n - 1)) variant)
    return total_distance / len(rankings)# }}}
def kmny_dist_loss(orderings: list[list[str]], truth: list[str]) -> int:# {{{
    """Return the Kendall tau distance between the truth and the Kemeny-Young
    aggregation of `orderings`."""
    _, aggregated_ranking = rank_aggregation(odrk.rankings_from_orderings(orderings))
    # turn the aggregated ranking back into an ordering over the true labels,
    # then compare both as rankings
    aggregated_ordering = odrk.ordering_from_ranking(aggregated_ranking, truth)
    distance = kendall_tau_dist(odrk.ranking_from_ordering(aggregated_ordering),
                                odrk.ranking_from_ordering(truth))
    return distance# }}}
################## APPLIED ON SAMPLES FOR CONCENTRATION TESTS ##################
def get_loss_progression(): # {{{
grouped_orderings = find_orderings(parameter=PARAMETER,
@@ -104,7 +78,6 @@ def get_loss_progression(): # {{{
return average_losses, kendal_aggregation_losses
# }}}
################## APPLIED ON SAMPLES FOR CONCENTRATION TESTS ##################
def plot_loss_progression(): # {{{
"""Plot the progression of losses when using more and more of the values

View File

@@ -5,3 +5,4 @@ fastcache
tqdm
joblib
scipy
PyYAML

View File

@@ -0,0 +1 @@
{"duration": 16.941783666610718, "input_args": {"q": "\"\\n SELECT p_color, p_container, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN part ON lo_partkey = p_partkey\\nWHERE p_color IN ('aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkgrey', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'grey', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'rebeccapurple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 'snow', 'springgreen', 'steelblue', 'tan', 
'teal', 'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen')\\n\\n GROUP BY p_color, p_container\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1719876952.150381}

View File

@@ -1 +0,0 @@
{"duration": 0.014148950576782227, "input_args": {"q": "\"\\n SELECT departure_airport, airline, SUM(nb_flights)\\n FROM fact_table\\n INNER JOIN airport_dim ON airport_dim.iata_code = fact_table.departure_airport\\n NATURAL JOIN hour_dim\\n INNER JOIN time_dim ON time_dim.day = fact_table.date\\n WHERE departure_airport IN ('ATL', 'ORD', 'DFW', 'DEN', 'LAX', 'IAH', 'LAS', 'SFO', 'PHX', 'MCO', 'SEA', 'CLT', 'MSP', 'LGA', 'DTW', 'EWR', 'BOS', 'BWI', 'SLC', 'JFK')\\n GROUP BY departure_airport, airline\\n ORDER BY SUM(nb_flights) DESC;\\n \""}, "time": 1717674727.832313}

View File

@@ -1 +0,0 @@
{"duration": 12.216989040374756, "input_args": {"q": "\"\\n SELECT p_color, p_container, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN part ON lo_partkey = p_partkey\\n\\n WHERE p_color IN ('aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkgrey', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'grey', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'rebeccapurple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 'snow', 'springgreen', 'steelblue', 'tan', 
'teal', 'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen')\\n GROUP BY p_color, p_container\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1717680541.325936}

View File

@@ -0,0 +1 @@
{"duration": 16.9809091091156, "input_args": {"q": "\"\\n SELECT p_color, p_category, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN part ON lo_partkey = p_partkey\\nWHERE p_color IN ('aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkgrey', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'grey', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'rebeccapurple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 'snow', 'springgreen', 'steelblue', 'tan', 'teal', 
'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen')\\n\\n GROUP BY p_color, p_category\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1719564027.6609159}

View File

@@ -0,0 +1 @@
{"duration": 12.007299900054932, "input_args": {"q": "\"\\n SELECT p_color, p_brand, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN part ON lo_partkey = p_partkey\\nWHERE p_color IN ('azure', 'bisque', 'black', 'aquamarine')\\n\\n GROUP BY p_color, p_brand\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1719876425.192638}

View File

@@ -1 +0,0 @@
{"duration": 15.925843238830566, "input_args": {"q": "\"\\n SELECT p_color, c_city, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN customer ON lo_custkey = c_custkey\\nINNER JOIN part ON lo_partkey = p_partkey\\n\\n WHERE p_color IN ('aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkgrey', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'grey', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'rebeccapurple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 
'snow', 'springgreen', 'steelblue', 'tan', 'teal', 'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen')\\n GROUP BY p_color, c_city\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1717599419.778661}

View File

@@ -1 +0,0 @@
{"duration": 12.698099851608276, "input_args": {"q": "\"\\n SELECT p_color, s_region, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN part ON lo_partkey = p_partkey\\nINNER JOIN supplier ON lo_suppkey = s_suppkey\\n\\n WHERE p_color IN ('aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkgrey', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'grey', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'rebeccapurple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 
'snow', 'springgreen', 'steelblue', 'tan', 'teal', 'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen')\\n GROUP BY p_color, s_region\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1717599488.394772}

View File

@@ -0,0 +1 @@
{"duration": 12.975467920303345, "input_args": {"q": "\"\\n SELECT p_color, s_city, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN part ON lo_partkey = p_partkey\\nINNER JOIN supplier ON lo_suppkey = s_suppkey\\nWHERE p_color IN ('azure', 'bisque', 'black', 'aquamarine')\\n\\n GROUP BY p_color, s_city\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1719876439.027987}

View File

@@ -1 +0,0 @@
{"duration": 13.360551118850708, "input_args": {"q": "\"\\n SELECT p_color, s_city, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN part ON lo_partkey = p_partkey\\nINNER JOIN supplier ON lo_suppkey = s_suppkey\\n\\n WHERE p_color IN ('aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkgrey', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'grey', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'rebeccapurple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 
'snow', 'springgreen', 'steelblue', 'tan', 'teal', 'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen')\\n GROUP BY p_color, s_city\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1717599475.656039}

View File

@@ -0,0 +1 @@
{"duration": 12.97324800491333, "input_args": {"q": "\"\\n SELECT p_color, p_brand, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN part ON lo_partkey = p_partkey\\nWHERE p_color IN ('aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkgrey', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'grey', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'rebeccapurple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 'snow', 'springgreen', 'steelblue', 'tan', 'teal', 
'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen')\\n\\n GROUP BY p_color, p_brand\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1719579491.6208699}

View File

@@ -1 +0,0 @@
{"duration": 0.02377486228942871, "input_args": {"q": "\"\\n SELECT departure_airport, day, SUM(nb_flights)\\n FROM fact_table\\n INNER JOIN airport_dim ON airport_dim.iata_code = fact_table.departure_airport\\n NATURAL JOIN hour_dim\\n INNER JOIN time_dim ON time_dim.day = fact_table.date\\n WHERE departure_airport IN ('ATL', 'ORD', 'DFW', 'DEN', 'LAX', 'IAH', 'LAS', 'SFO', 'PHX', 'MCO', 'SEA', 'CLT', 'MSP', 'LGA', 'DTW', 'EWR', 'BOS', 'BWI', 'SLC', 'JFK')\\n GROUP BY departure_airport, day\\n ORDER BY SUM(nb_flights) DESC;\\n \""}, "time": 1717674727.8571048}

View File

@@ -0,0 +1 @@
{"duration": 17.964900255203247, "input_args": {"q": "\"\\n SELECT p_color, s_region, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN part ON lo_partkey = p_partkey\\nINNER JOIN supplier ON lo_suppkey = s_suppkey\\nWHERE p_color IN ('aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkgrey', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'grey', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'rebeccapurple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 
'snow', 'springgreen', 'steelblue', 'tan', 'teal', 'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen')\\n\\n GROUP BY p_color, s_region\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1719876970.129612}

View File

@@ -1 +0,0 @@
{"duration": 0.00795602798461914, "input_args": {"q": "\"\\n SELECT departure_airport, month, SUM(nb_flights)\\n FROM fact_table\\n INNER JOIN airport_dim ON airport_dim.iata_code = fact_table.departure_airport\\n NATURAL JOIN hour_dim\\n INNER JOIN time_dim ON time_dim.day = fact_table.date\\n WHERE departure_airport IN ('ATL', 'ORD', 'DFW', 'DEN', 'LAX', 'IAH', 'LAS', 'SFO', 'PHX', 'MCO', 'SEA', 'CLT', 'MSP', 'LGA', 'DTW', 'EWR', 'BOS', 'BWI', 'SLC', 'JFK')\\n GROUP BY departure_airport, month\\n ORDER BY SUM(nb_flights) DESC;\\n \""}, "time": 1717674727.8699038}

View File

@@ -1 +0,0 @@
{"duration": 0.00851297378540039, "input_args": {"q": "\"\\n SELECT departure_airport, year, SUM(nb_flights)\\n FROM fact_table\\n INNER JOIN airport_dim ON airport_dim.iata_code = fact_table.departure_airport\\n NATURAL JOIN hour_dim\\n INNER JOIN time_dim ON time_dim.day = fact_table.date\\n WHERE departure_airport IN ('ATL', 'ORD', 'DFW', 'DEN', 'LAX', 'IAH', 'LAS', 'SFO', 'PHX', 'MCO', 'SEA', 'CLT', 'MSP', 'LGA', 'DTW', 'EWR', 'BOS', 'BWI', 'SLC', 'JFK')\\n GROUP BY departure_airport, year\\n ORDER BY SUM(nb_flights) DESC;\\n \""}, "time": 1717674727.8793159}

View File

@@ -0,0 +1 @@
{"duration": 14.970414876937866, "input_args": {"q": "\"\\n SELECT p_color, c_city, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN customer ON lo_custkey = c_custkey\\nINNER JOIN part ON lo_partkey = p_partkey\\nWHERE p_color IN ('azure', 'bisque', 'black', 'aquamarine')\\n\\n GROUP BY p_color, c_city\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1719876469.760285}

View File

@@ -0,0 +1 @@
{"duration": 12.126240015029907, "input_args": {"q": "\"\\n SELECT p_color, p_type, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN part ON lo_partkey = p_partkey\\nWHERE p_color IN ('azure', 'bisque', 'black', 'aquamarine')\\n\\n GROUP BY p_color, p_type\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1719876753.84258}

View File

@@ -1 +0,0 @@
{"duration": 12.400226831436157, "input_args": {"q": "\"\\n SELECT p_color, p_type, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN part ON lo_partkey = p_partkey\\n\\n WHERE p_color IN ('aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkgrey', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'grey', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'rebeccapurple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 'snow', 'springgreen', 'steelblue', 'tan', 
'teal', 'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen')\\n GROUP BY p_color, p_type\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1717680529.093871}

View File

@@ -0,0 +1 @@
{"duration": 14.343026876449585, "input_args": {"q": "\"\\n SELECT p_color, c_region, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN customer ON lo_custkey = c_custkey\\nINNER JOIN part ON lo_partkey = p_partkey\\nWHERE p_color IN ('bisque', 'blue')\\n\\n GROUP BY p_color, c_region\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1719876820.283672}

View File

@@ -0,0 +1 @@
{"duration": 13.514874935150146, "input_args": {"q": "\"\\n SELECT p_color, p_category, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN part ON lo_partkey = p_partkey\\nWHERE p_color IN ('azure', 'bisque', 'black', 'aquamarine')\\n\\n GROUP BY p_color, p_category\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1719876484.837832}

View File

@@ -0,0 +1 @@
{"duration": 14.327998876571655, "input_args": {"q": "\"\\n SELECT p_color, c_city, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN customer ON lo_custkey = c_custkey\\nINNER JOIN part ON lo_partkey = p_partkey\\nWHERE p_color IN ('aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkgrey', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'grey', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'rebeccapurple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 
'snow', 'springgreen', 'steelblue', 'tan', 'teal', 'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen')\\n\\n GROUP BY p_color, c_city\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1719583996.5475771}

View File

@@ -0,0 +1 @@
{"duration": 19.513118982315063, "input_args": {"q": "\"\\n SELECT p_color, c_region, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN customer ON lo_custkey = c_custkey\\nINNER JOIN part ON lo_partkey = p_partkey\\nWHERE p_color IN ('aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkgrey', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'grey', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'rebeccapurple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 
'snow', 'springgreen', 'steelblue', 'tan', 'teal', 'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen')\\n\\n GROUP BY p_color, c_region\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1719876989.662223}

View File

@@ -1 +0,0 @@
{"duration": 12.51117491722107, "input_args": {"q": "\"\\n SELECT p_color, p_brand, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN part ON lo_partkey = p_partkey\\n\\n WHERE p_color IN ('aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkgrey', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'grey', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'rebeccapurple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 'snow', 'springgreen', 'steelblue', 'tan', 
'teal', 'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen')\\n GROUP BY p_color, p_brand\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1717599462.2399411}

View File

@@ -0,0 +1 @@
{"duration": 18.672475337982178, "input_args": {"q": "\"\\n SELECT p_color, s_city, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN part ON lo_partkey = p_partkey\\nINNER JOIN supplier ON lo_suppkey = s_suppkey\\nWHERE p_color IN ('aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkgrey', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'grey', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'rebeccapurple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 
'snow', 'springgreen', 'steelblue', 'tan', 'teal', 'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen')\\n\\n GROUP BY p_color, s_city\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1719564087.0910451}

View File

@@ -0,0 +1 @@
{"duration": 13.761728048324585, "input_args": {"q": "\"\\n SELECT p_color, p_container, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN part ON lo_partkey = p_partkey\\nWHERE p_color IN ('azure', 'bisque', 'black', 'aquamarine')\\n\\n GROUP BY p_color, p_container\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1719877064.6480262}

View File

@@ -0,0 +1 @@
{"duration": 14.175416707992554, "input_args": {"q": "\"\\n SELECT p_color, c_nation, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN customer ON lo_custkey = c_custkey\\nINNER JOIN part ON lo_partkey = p_partkey\\nWHERE p_color IN ('bisque', 'blue')\\n\\n GROUP BY p_color, c_nation\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1719876849.762035}

View File

@@ -0,0 +1 @@
{"duration": 15.753787755966187, "input_args": {"q": "\"\\n SELECT p_color, c_region, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN customer ON lo_custkey = c_custkey\\nINNER JOIN part ON lo_partkey = p_partkey\\nWHERE p_color IN ('azure', 'bisque', 'black', 'aquamarine')\\n\\n GROUP BY p_color, c_region\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1719882521.904444}

View File

@@ -0,0 +1 @@
{"duration": 14.510737180709839, "input_args": {"q": "\"\\n SELECT p_color, c_city, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN customer ON lo_custkey = c_custkey\\nINNER JOIN part ON lo_partkey = p_partkey\\nWHERE p_color IN ('azure', 'blue')\\n\\n GROUP BY p_color, c_city\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1719876891.030114}

View File

@@ -1 +0,0 @@
{"duration": 11.881813049316406, "input_args": {"q": "\"\\n SELECT p_color, p_color, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN part ON lo_partkey = p_partkey\\n\\n WHERE p_color IN ('aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkgrey', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'grey', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'rebeccapurple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 'snow', 'springgreen', 'steelblue', 'tan', 
'teal', 'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen')\\n GROUP BY p_color, p_color\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1717599431.864533}

View File

@@ -0,0 +1 @@
{"duration": 11.52425217628479, "input_args": {"q": "\"\\n SELECT p_color, p_container, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN part ON lo_partkey = p_partkey\\nWHERE p_color IN ('bisque', 'blue')\\n\\n GROUP BY p_color, p_container\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1719876799.081373}

View File

@@ -1 +0,0 @@
{"duration": 0.0241241455078125, "input_args": {"q": "\"\\n SELECT departure_airport, departure_hour, SUM(nb_flights)\\n FROM fact_table\\n INNER JOIN airport_dim ON airport_dim.iata_code = fact_table.departure_airport\\n NATURAL JOIN hour_dim\\n INNER JOIN time_dim ON time_dim.day = fact_table.date\\n WHERE departure_airport IN ('ATL', 'ORD', 'DFW', 'DEN', 'LAX', 'IAH', 'LAS', 'SFO', 'PHX', 'MCO', 'SEA', 'CLT', 'MSP', 'LGA', 'DTW', 'EWR', 'BOS', 'BWI', 'SLC', 'JFK')\\n GROUP BY departure_airport, departure_hour\\n ORDER BY SUM(nb_flights) DESC;\\n \""}, "time": 1717674748.1134489}

View File

@@ -0,0 +1 @@
{"duration": 18.256238222122192, "input_args": {"q": "\"\\n SELECT p_color, s_nation, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN part ON lo_partkey = p_partkey\\nINNER JOIN supplier ON lo_suppkey = s_suppkey\\nWHERE p_color IN ('aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkgrey', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'grey', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'rebeccapurple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 
'snow', 'springgreen', 'steelblue', 'tan', 'teal', 'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen')\\n\\n GROUP BY p_color, s_nation\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1719877024.142326}

View File

@@ -1 +0,0 @@
{"duration": 12.596222877502441, "input_args": {"q": "\"\\n SELECT p_color, p_category, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN part ON lo_partkey = p_partkey\\n\\n WHERE p_color IN ('aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkgrey', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'grey', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'rebeccapurple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 'snow', 'springgreen', 'steelblue', 'tan', 
'teal', 'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen')\\n GROUP BY p_color, p_category\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1717599449.712944}

View File

@@ -1 +0,0 @@
{"duration": 13.08634901046753, "input_args": {"q": "\"\\n SELECT p_color, s_nation, SUM(lo_quantity)\\n FROM lineorder\\n INNER JOIN part ON lo_partkey = p_partkey\\nINNER JOIN supplier ON lo_suppkey = s_suppkey\\n\\n WHERE p_color IN ('aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkgrey', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'grey', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'rebeccapurple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 
'snow', 'springgreen', 'steelblue', 'tan', 'teal', 'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen')\\n GROUP BY p_color, s_nation\\n ORDER BY SUM(lo_quantity) DESC;\\n \""}, "time": 1717680554.546965}

View File

@@ -1,8 +1,10 @@
# first line: 29
# first line: 34
@memory.cache  # persistent memoization: identical queries are served from disk cache
def query(q: str) -> list[tuple]:
    """Execute a given query and return the result in a python list[tuple].

    Parameters
    ----------
    q : str
        The SQL query text to execute on the module-level cursor ``CUR``.

    Returns
    -------
    list[tuple]
        All rows of the result set, fetched eagerly so the value is
        picklable by joblib's cache.
    """
    # BUG FIX: the diff-merged text kept both the old one-line and the new
    # two-line `if VERBOSE` forms, so each message was printed twice.
    if VERBOSE:
        print(f'sending query : {q}')
    res = CUR.execute(str(q))
    if VERBOSE:
        print("got response", res)
    return res.fetchall()

View File

@@ -1,425 +1,66 @@
import matplotlib.pyplot as plt
from matplotlib.colors import CSS4_COLORS
import numpy as np
from scipy.stats import norm as Norm, beta as Beta, t as Student
from tprint import tprint
import orderankings as odrk
from querying import find_orderings
from kemeny_young import kendall_tau_dist, rank_aggregation
from tqdm import tqdm
from collections import Counter, defaultdict
import joblib
from functools import partial
import random
import querying as qry
import kemeny_young as ky
# Random number generator for the whole program
RNG = np.random.default_rng(1234)
from config import CONFIG as CFG, DATABASE_CFG
VERBOSE = True
VERBOSE = False
######################## YAML CONFIG (src/config.yaml) #########################
DATABASE_NAME = CFG["database_name"]
VERBOSE = CFG["verbose"]["concentration_test"]
HYPOTHESIS_RANKING = odrk.ranking_from_ordering(
DATABASE_CFG["hypothesis_ordering"])
#################### CONCENTRATION TESTS ON RANDOM QUERIES #####################
################## DATA SETTINGS (parameters, hypothesis...) ###################
# """ comment this line when using the SSB dataset
# SSB dataset settings # {{{
PARAMETER = "p_color"
SUMMED_ATTRIBUTE = "lo_quantity"
# SUMMED_ATTRIBUTE = "lo_revenue"
# SUMMED_ATTRIBUTE = "lo_extendedprice"
LENGTH = 2
authorized_parameter_values = {
"p_size": tuple(map(int, range(50))),
"p_color": tuple(CSS4_COLORS.keys()),
}
AUTHORIZED_PARAMETER_VALUES = authorized_parameter_values[PARAMETER]
CRITERION = (
##### customer table
# "c_region",
"c_city",
# "c_nation",
##### part table
"p_category",
"p_brand",
# "p_mfgr",
# "p_color",
# "p_type",
# "p_container",
##### supplier table
"s_city",
# "s_nation",
# "s_region",
##### order date
# "D_DATE",
# "D_DATEKEY",
# "D_DATE",
# "D_DAYOFWEEK",
# "D_MONTH",
# "D_YEAR",
# "D_YEARMONTHNUM",
# "D_YEARMONTH",
# "D_DAYNUMINWEEK"
# "D_DAYNUMINMONTH",
# "D_DAYNUMINYEAR",
# "D_MONTHNUMINYEAR",
# "D_WEEKNUMINYEAR",
# "D_SELLINGSEASON",
# "D_LASTDAYINWEEKFL",
# "D_LASTDAYINMONTHFL",
# "D_HOLIDAYFL",
# "D_WEEKDAYFL",
)
HYPOTHESIS_ORDERING = ("bisque", "aquamarine")
HYPOTHESIS_ORDERING = ("bisque", "blue")
# HYPOTHESIS_ORDERING = [2, 32]
# HYPOTHESIS_ORDERING = [30, 18]
# HYPOTHESIS_ORDERING = [37, 49, 10]
# }}}
""" # flight_delay dataset settings {{{
PARAMETER = "departure_airport"
SUMMED_ATTRIBUTE = "nb_flights"
LENGTH = 3
CRITERION = (
# "airline",
"departure_hour", # simpson's paradox ?
# "day",
# "month",
# "year",
)
def rankings_loss(hypothesis_ranking, rankings: list[list[int]]) -> float:
    """Distance between the hypothesis and a collection of rankings.

    The loss is the Kendall-tau distance between ``hypothesis_ranking`` and
    the Kemeny-Young consensus (aggregation winner) of ``rankings``.
    """
    fit, consensus = ky.rank_aggregation(rankings)
    if VERBOSE:
        print("rank aggregation fit (τ distance to each aggregated ranking) :",
              fit)
        print(hypothesis_ranking, consensus)
    return ky.kendall_tau_dist(hypothesis_ranking, consensus)
GLOBAL_ORDERING = ['ATL', 'ORD', 'DFW', 'DEN', 'LAX', 'IAH', 'LAS',
'SFO', 'PHX', 'MCO', 'SEA', 'CLT', 'MSP', 'LGA',
'DTW', 'EWR', 'BOS', 'BWI', 'SLC', 'JFK']
AUTHORIZED_PARAMETER_VALUES = GLOBAL_ORDERING
# Correct hypothesis for each length (so the loss converges to 0)
CORRECT_ORDERINGS = defaultdict(lambda: GLOBAL_ORDERING)
CORRECT_ORDERINGS[2] = ['ATL', 'DEN']
CORRECT_ORDERINGS[3] = ['ATL', 'DFW', 'ORD']
CORRECT_ORDERINGS[4] = ['ATL', 'DEN', 'DFW', 'ORD']
CORRECT_ORDERINGS[5] = ['ATL', 'ORD', 'DFW', 'DEN', 'LAX']
# now select the right one according to the LENGTH
CORRECT_ORDERING = CORRECT_ORDERINGS[LENGTH][:LENGTH]
# Use the correct ordering
HYPOTHESIS_ORDERING = CORRECT_ORDERING
print(HYPOTHESIS_ORDERING)
# HYPOTHESIS_ORDERING = ['ATL', 'ORD', 'DWF', 'DEN', 'LAX']
# HYPOTHESIS_ORDERING = ['ATL', 'ORD', 'DFW', 'LAX', 'DEN', 'IAH'][:LENGTH]
# HYPOTHESIS_ORDERING = ['ATL', 'ORD', 'DFW', 'DEN', 'LAS', 'LAX', 'IAH'][:LENGTH]
# HYPOTHESIS_ORDERING = ['ORD', 'ATL', 'DEN', 'DFW', 'LAX'] # interesting loss curve
assert len(HYPOTHESIS_ORDERING) == LENGTH
# }}}
# """
def orderings_average_loss(orderings: list[list[str]], truth: list[str]) -> float:# {{{
    """Average of the Kendall-tau distances between the truth and each ordering.

    Orderings (lists of labels) are converted to rankings (lists of indices)
    before delegating the actual computation to ``rankings_average_loss``.
    """
    as_rankings = odrk.rankings_from_orderings(orderings)
    truth_ranking = odrk.rankings_from_orderings([truth])[0]
    return rankings_average_loss(as_rankings, truth_ranking)# }}}
def rankings_average_loss(rankings: list[list[int]], truth: list[int]) -> float:# {{{
    """Return the mean Kendall-tau distance from ``truth`` to each ranking.

    Robustness fix: an empty ``rankings`` list now yields 0.0 instead of
    raising ZeroDivisionError.
    """
    if not rankings:
        return 0.0
    distance = sum(kendall_tau_dist(rkng, truth) for rkng in rankings)
    # apparently, a plain mean is what works for a good normalization
    # (rather than the pair-count form: distance * 2 / (n * (n - 1)))
    return distance / len(rankings)# }}}
def kmny_dist_loss(orderings: list[list[str]], truth: list[str]) -> int:# {{{
    """Return the kendall tau distance between the truth and the kemeny-young
    aggregation of orderings."""
    # aggregate all orderings into one consensus ranking, then map it back
    # to an ordering over the labels of `truth`
    _, agg_rank = rank_aggregation(odrk.rankings_from_orderings(orderings))
    aggregation = odrk.ordering_from_ranking(agg_rank, truth)
    loss = kendall_tau_dist(
            odrk.ranking_from_ordering(aggregation),
            odrk.ranking_from_ordering(truth))
    # BUG FIX: `loss` was computed but never returned (the function
    # implicitly returned None despite its docstring and annotation)
    return loss
def loss_of_random_query(hypothesis_ranking) -> float:
    """Score the hypothesis against the rankings of one random query.

    Draws a random query via ``qry.random_query``, extracts the rankings of
    its result table, and returns ``rankings_loss`` between those rankings
    and ``hypothesis_ranking``.
    """
    drawn_query = qry.random_query()
    table_rankings = qry.rankings_from_table(drawn_query)
    query_loss = rankings_loss(hypothesis_ranking, table_rankings)
    if VERBOSE:
        print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
        print("hypothesis ranking :")
        print(hypothesis_ranking)
        print("rankings :")
        print(table_rankings)
        print("loss :")
        print(query_loss)
    return query_loss
# print(aggregation, HYPOTHESIS_ORDERING, kdl_agg_dist)}}}
def get_loss_progression(): # {{{
grouped_orderings = find_orderings(parameter=PARAMETER,
summed_attribute=SUMMED_ATTRIBUTE,
criterion=CRITERION,
length=LENGTH)
RNG.shuffle(grouped_orderings)
def concentration_test(hypothesis_ranking, N: int) -> list[float]:
    """Collect the losses of ``N`` independent random queries.

    Each element is ``loss_of_random_query(hypothesis_ranking)`` for one
    independent draw.
    """
    return [loss_of_random_query(hypothesis_ranking) for _ in range(N)]
average_losses = []
kendal_aggregation_losses = []
for nb_considered_orderings in range(1, len(grouped_orderings)+1):
# loss as the average distance from truth to all considered orderings
considered_orderings = grouped_orderings[:nb_considered_orderings]
loss = orderings_average_loss(orderings=considered_orderings,
truth=HYPOTHESIS_ORDERING)
# loss as the distance between truth and the aggregation
kdl_agg_loss = kmny_dist_loss(orderings=considered_orderings,
truth=HYPOTHESIS_ORDERING)
kendal_aggregation_losses.append(kdl_agg_loss)
if VERBOSE:
print(f"using {nb_considered_orderings} orderings")
tprint(considered_orderings)
print("truth :", HYPOTHESIS_ORDERING)
print("loss =", loss)
average_losses.append(loss)
return average_losses, kendal_aggregation_losses
# }}}
def plot_loss_progression(): # {{{
    """Plot the progression of losses when using more and more of the values
    (see get_loss_progression).

    Runs ``get_loss_progression`` N=20 times, sums the loss curves
    element-wise, and plots the summed average-loss curve (orange) and the
    summed kemeny-aggregation-loss curve (green) on the current figure.

    NOTE(review): the plotted curves are the SUM over the N runs, not the
    mean (there is no division by N) — confirm this is intentional.
    """
    N = 20  # number of repetitions to accumulate
    avg_loss_progression, kdl_agg_loss_progression = get_loss_progression()
    avg_loss_progression = np.array(avg_loss_progression)
    kdl_agg_loss_progression = np.array(kdl_agg_loss_progression)
    # accumulate the remaining N-1 runs element-wise into the first one
    for _ in tqdm(range(N-1), leave=False):
        avg_lp, kmny_lp = get_loss_progression()
        avg_loss_progression += avg_lp
        kdl_agg_loss_progression += kmny_lp
    # print(progression)
    if VERBOSE:
        print(avg_loss_progression)
        print(kdl_agg_loss_progression)
    plt.plot(avg_loss_progression, color="orange")
    plt.plot(kdl_agg_loss_progression, color="green")
# }}}
def get_mode_loss_progression(all_orderings: list[list[str]],
                              number_of_steps: int,
                              orders_added_each_step: int =1) -> list[bool]:
    """Simulate the 0/1 loss of the modal (most frequent) ordering over time.

    Starts from ``orders_added_each_step`` orderings sampled with replacement
    from ``all_orderings``, then performs ``number_of_steps`` iterations; at
    each step more orderings are sampled and the loss is 0 when the most
    common ordering seen so far equals HYPOTHESIS_ORDERING, 1 otherwise.

    Returns a history of length ``number_of_steps * orders_added_each_step``
    (each step's loss repeated once per order added that step).
    NOTE(review): despite the annotation, the returned value is a numpy
    array of 0/1 floats, not a list of bools.
    """
    # all_rankings = odrk.rankings_from_orderings(all_orderings)
    # considered_orderings = list(RNG.choice(all_orderings, size=orders_added_each_step))
    considered_orderings = list(random.choices(all_orderings, k=orders_added_each_step))
    # count occurrences of each ordering (tuples, so they are hashable)
    orderings_count = Counter(map(tuple, considered_orderings))
    # loss progression when adding more and more orderings
    loss_history = np.zeros(number_of_steps)
    # # random permutation of the orderings
    # permuted_orderings = np.random.permutation(all_orderings)
    for idx in range(number_of_steps):
        # new_orders = RNG.choice(all_orderings, size=orders_added_each_step)
        new_orders = random.choices(all_orderings, k=orders_added_each_step)
        # new_orders = permuted_orderings[orders_added_each_step*idx:orders_added_each_step*(idx+1)]
        # considered_orderings.extend(new_orders)
        # update the counter of orderings occurrences
        orderings_count.update(Counter(map(tuple, new_orders)))
        # the most common (modal) ordering
        modal_ordering = orderings_count.most_common()[0][0]
        modal_ordering = np.array(modal_ordering)
        # if VERBOSE: print(modal_ordering)
        # the loss is 0 if the modal ordering equals the hypothesis, 1 otherwise
        loss = int(not np.array_equal(modal_ordering, HYPOTHESIS_ORDERING))
        # loss = int((modal_ordering == HYPOTHESIS_ORDERING).all())
        # loss = int(all(map(lambda x: x[0]==x[1],
        #                    zip(modal_ordering, HYPOTHESIS_ORDERING))))
        # add loss to the list of losses
        loss_history[idx] = loss
        if VERBOSE:
            # print(loss_history, HYPOTHESIS_ORDERING)
            print(orderings_count.most_common(1)[0])
    return np.repeat(loss_history, orders_added_each_step)
################################################################################
def plot_modal_losses():
    """Plot mean and dispersion of the modal-ordering 0/1 loss over N runs.

    Runs ``get_mode_loss_progression`` N times in parallel (joblib), then
    plots on the current matplotlib figure: the mean loss curve (green),
    its standard deviation (grey), and a Student-t confidence band.
    """
    ###################
    # sampling settings
    N = 100  # number of repetitions of the experiment
    max_number_of_orders = 7500  # max sample size
    GRANULARITY = 12  # granularity of the sampling (orders by iteration)
    number_of_steps = max_number_of_orders // GRANULARITY

    all_orderings = find_orderings(
            parameter=PARAMETER,
            summed_attribute=SUMMED_ATTRIBUTE,
            criterion=CRITERION,
            length=LENGTH,
            authorized_parameter_values=AUTHORIZED_PARAMETER_VALUES)
    print(f"there are {all_orderings.size} orders in total :")
    tprint(all_orderings, limit=10)

    # make get_mode_loss_progression parallelizable
    gmlp = joblib.delayed(get_mode_loss_progression)

    ####
    # Aggregate multiple simulations
    # don't use the tqdm progress bar if there are some logs
    range_N = range(N) if VERBOSE else tqdm(range(N))
    # for my 8-core computer, n_jobs=7 is empirically the best value
    loss_history = joblib.Parallel(n_jobs=7)(
            gmlp(all_orderings,
                 number_of_steps,
                 orders_added_each_step=GRANULARITY)
            for _ in range_N
            )
    loss_history = np.array(loss_history)
    # the sum of losses for each number of steps
    losses = np.sum(loss_history, axis=0)
    if VERBOSE:
        print("losses :", losses, sep="\n")

    #####
    # average
    # since losses is the sum of losses, losses/N is the average
    mean = losses / N
    plt.plot(mean, color="green", label="loss average")

    #####
    # standard deviation
    # variance is (average of squares) - (square of the average)
    # since we only have 1 or 0, average of squares is just the average
    # so the variance is average - average**2
    # stddev is the square root of variance
    stddev = np.sqrt(mean - mean**2)
    plt.plot(stddev, color="grey", label="loss standard deviation")

    ############################################################################
    # CONFIDENCE INTERVALS
    X = np.arange(mean.size)  # the x axis

    ######
    ## confidence interval
    ## assuming the experimental variance is the correct one
    #confidence = 0.95
    #alpha = 1 - confidence
    #eta = Norm.ppf(1 - alpha/2, loc=0, scale=1)
    #epsilon = eta * stddev / np.sqrt(N)
    #plt.fill_between(X, mean - epsilon, mean + epsilon,
    #                 color="blue", alpha=0.25,
    #                 label=f"{100*confidence}% confidence interval")

    #####
    # confidence interval
    # assuming each summed distribution is a normal distribution
    confidence = 0.999999
    delta = 1 - confidence
    # corrected sample variance
    # BUG FIX: was `(1 / N-1)`, which parses as (1/N) - 1 < 0 and makes the
    # sqrt return NaN; the corrected sample variance divides by (N - 1).
    # NOTE(review): S is currently unused — epsilon below uses stddev.
    S = np.sqrt((1 / (N-1)) * (mean - mean**2))
    eta = Student(df=N-1).ppf(1 - delta/2)
    epsilon = eta * stddev / np.sqrt(N)
    plt.fill_between(X, mean - epsilon, mean + epsilon,
                     color="green", alpha=0.2,
                     label=f"{100*confidence}% confidence interval")
    # confidence = 0.95
    # delta = 1 - confidence
    # eta = Student(df=X-1).ppf(1 - delta/2)
    # epsilon = eta * stddev / np.sqrt(X)
    # plt.fill_between(X, mean - epsilon, mean + epsilon,
    #                  color="green", alpha=0.5,
    #                  label=f"{100*confidence}% confidence interval")

    ######
    ## beta distribution
    ## confidence = 0.95
    #delta = 1 - confidence
    #alpha = np.cumsum(1 - loss_history, axis=1).mean(axis=0)
    #beta = np.cumsum(loss_history, axis=1).mean(axis=0)
    #epsilon = Beta.ppf(1 - delta/2, alpha, beta)
    #plt.fill_between(X, mean - epsilon, mean + epsilon,
    #                 color="orange", alpha=0.30,
    #                 label=f"{100*confidence} β confidence interval")

    ######
    ## fluctuation interval
    #confidence = 0.1
    #alpha = 1-confidence
    #k = Norm.ppf(alpha/2, loc=0, scale=1)
    #fluctuation = k * stddev
    #plt.fill_between(X, mean - fluctuation, mean + fluctuation,
    #                 color="orange", alpha=0.25,
    #                 label=f"{100*confidence}% fluctuation interval")

    ######
    ## hoeffding
    #t = 0.9999999
    #plt.plot(X, 2 * np.exp(-2 * t ** 2 / X),
    #         color="red")

    ######
    ## y = 1/2
    #plt.plot([0, mean.size], [0.5, 0.5],
    #         color="orange", alpha=0.25)
if __name__ == '__main__':
    # Example rankings kept from earlier experiments; currently unused by the
    # code below. Note the duplicated row [2, 3, 1, 4] and the row
    # [1, 3, 2, 1] which repeats the value 1 — presumably leftovers from
    # manual testing.
    rankings = np.array([[1, 3, 2, 4],
                         [3, 4, 2, 1],
                         [1, 2, 3, 4],
                         [1, 3, 2, 4],
                         [2, 3, 1, 4],
                         [1, 3, 2, 1],
                         [2, 3, 1, 4],
                         [2, 3, 1, 4]])
    # all_orderings = find_orderings(parameter=PARAMETER,
    #                                summed_attribute=SUMMED_ATTRIBUTE,
    #                                criterion=CRITERION,
    #                                length=LENGTH)
    # # print(all_orderings)
    # print(f"There are {len(all_orderings)} orderings in `all_orderings`")
    # for _ in range(20):
    #     dep = time()
    #     plot_modal_losses()
    #     print(round(time()-dep, 4))
    plt.style.use('dark_background')
    # HYPOTHESIS_ORDERING = ("bisque", "aquamarine")
    # plot_modal_losses()
    # overrides the module-level hypothesis ordering before plotting
    HYPOTHESIS_ORDERING = ("bisque", "blue")
    plot_modal_losses()
    plt.legend()
    ax = plt.gca()
    # ax.set_ylim([0, 1])
    # plt.ion()
    plt.show()
    # NOTE(review): neither concentration_test nor HYPOTHESIS_RANKING is
    # defined anywhere visible in this file — this line probably predates a
    # refactor and will raise NameError; confirm before running.
    print(concentration_test(HYPOTHESIS_RANKING, 5))

29
src/config.py Normal file
View File

@@ -0,0 +1,29 @@
"""
This module loads the yaml config from
"""
from yaml import load as yaml_load, Loader as yaml_Loader
from os import environ # access environment variables
# absolute path to the home of the virtual environment
# doesn't have any trailing "/"
VENV_HOME = environ.get('VIRTUAL_ENV').rstrip('/')
CONFIG_FILE_NAME = 'config.yaml'
# absolute path to the yaml config file
CONFIG_FILE_PATH = f"{VENV_HOME}/src/{CONFIG_FILE_NAME}"
# load the config into the CONFIG variable
with open(CONFIG_FILE_PATH) as config:
CONFIG = yaml_load(config, Loader=yaml_Loader)
# name of the current database (from the config file)
DATABASE_NAME = CONFIG["database_name"]
# configuration specific to the current database
DATABASE_CFG = CONFIG["database"][DATABASE_NAME]
# absolute path to the sqlite database file
DATABASE_FILE = f"{VENV_HOME}/{DATABASE_NAME}_dataset/{DATABASE_NAME}.db"

View File

@@ -2,13 +2,14 @@
# database_name: flight_delay
database_name: SSB
dataset_config:
database:
SSB: # {{{
orders_length: 2
orders_length: 4
# hypothesis_ordering: ['bisque', 'aquamarine']
hypothesis_ordering: ['bisque', 'blue']
# hypothesis_ordering: ['azure', 'blue']
hypothesis_ordering: ['azure', 'bisque', 'black', 'aquamarine']
# hypothesis_ordering: [30, 18]
# hypothesis_ordering: [2, 32]
@@ -18,7 +19,9 @@ dataset_config:
# authorized_parameter_values: !!python/object/apply:builtins.range [0, 50]
parameter: p_color
authorized_parameter_values: !!python/tuple ['aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkgrey', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'grey', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'rebeccapurple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 'snow', 'springgreen', 'steelblue', 'tan', 'teal', 'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen']
# authorized_parameter_values: !!python/tuple ['aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkgrey', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'grey', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgreen', 'lightgrey', 'lightpink', 'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray', 'lightslategrey', 'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen', 'magenta', 'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue', 'mediumspringgreen', 'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'rebeccapurple', 'red', 'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'slategrey', 'snow', 'springgreen', 'steelblue', 'tan', 'teal', 'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen']
authorized_parameter_values: ['azure', 'bisque', 'black', 'aquamarine']
# authorized_parameter_values: ['azure', 'blue']
summed_attribute: lo_quantity
# summed_attribute: lo_revenue
@@ -26,22 +29,22 @@ dataset_config:
criterion:
##### customer table
# - "c_region"
- "c_region"
- "c_city"
# "c_nation"
- "c_nation"
##### part table
- "p_category"
- "p_brand"
# - "p_mfgr"
# - "p_color"
# - "p_type"
# - "p_container"
- "p_type"
- "p_container"
##### supplier table
- "s_city"
# "s_nation"
# "s_region"
- "s_nation"
- "s_region"
##### order date
# - "D_DATE"
@@ -94,10 +97,14 @@ dataset_config:
# }}}
# set which parts of the program should ouput logs
verbose:
# queries to the database (src/querying.py)
querying: false
concentration_test: false
querying: true
concentration_test: true
# memoïze the result of queries
persistent_query_memoization: true

View File

@@ -1,323 +1,325 @@
data = [
(10, 3.2814),
(10, 1.1246),
(10, 1.2786),
(10, 1.4048),
(10, 1.321),
(10, 1.0877),
(10, 1.3789),
(10, 1.2656),
(10, 1.2232),
(10, 1.1576),
(10, 1.0716),
(10, 1.1329),
(10, 1.2229),
(10, 1.0674),
(10, 1.1904),
(10, 1.1503),
(10, 1.1237),
(10, 1.0695),
(10, 1.192),
(10, 1.1163),
(2, 4.985),
(2, 3.4106),
(2, 4.4639),
(2, 3.8917),
(2, 3.5325),
(2, 3.6275),
(2, 3.586),
(2, 3.7085),
(2, 3.5506),
(2, 3.882),
(2, 3.4114),
(2, 2.9221),
(2, 3.0728),
(2, 3.2228),
(2, 3.126),
(2, 3.018),
(2, 2.6121),
(2, 3.3835),
(2, 2.688),
(2, 2.7131),
(3, 4.9138),
(3, 3.6681),
(3, 4.228),
(3, 4.2168),
(3, 3.6797),
(3, 3.2504),
(3, 3.3086),
(3, 3.8523),
(3, 3.4246),
(3, 3.3924),
(3, 3.4794),
(3, 3.3593),
(3, 3.7011),
(3, 3.8801),
(3, 3.6497),
(3, 3.4457),
(3, 3.1876),
(3, 3.3091),
(3, 3.2624),
(3, 3.1918),
(4, 3.996),
(4, 2.3734),
(4, 2.3895),
(4, 2.027),
(4, 2.0217),
(4, 1.9908),
(4, 2.0311),
(4, 1.9258),
(4, 2.0102),
(4, 2.0338),
(4, 2.0078),
(4, 2.0199),
(4, 1.9693),
(4, 2.0876),
(4, 1.9746),
(4, 2.1291),
(4, 2.0353),
(4, 2.0223),
(4, 1.9693),
(4, 2.1176),
(5, 3.6458),
(5, 1.9484),
(5, 2.0161),
(5, 1.999),
(5, 1.9481),
(5, 2.0306),
(5, 2.0121),
(5, 2.0052),
(5, 1.9338),
(5, 1.9788),
(5, 1.8997),
(5, 2.0425),
(5, 2.009),
(5, 2.0407),
(5, 2.5651),
(5, 2.3494),
(5, 4.0412),
(5, 2.3624),
(5, 2.1484),
(5, 2.1279),
(6, 3.0398),
(6, 1.3934),
(6, 1.5696),
(6, 1.3557),
(6, 1.5808),
(6, 1.2172),
(6, 1.4345),
(6, 1.2293),
(6, 1.1803),
(6, 1.5682),
(6, 1.2226),
(6, 1.3786),
(6, 1.1973),
(6, 1.2538),
(6, 1.326),
(6, 1.285),
(6, 1.4086),
(6, 1.4677),
(6, 1.325),
(6, 1.7864),
(6, 2.8935),
(6, 1.4145),
(6, 1.2627),
(6, 1.2306),
(6, 1.4593),
(6, 1.4569),
(6, 1.4273),
(6, 1.2546),
(6, 1.8061),
(6, 1.7507),
(6, 1.8094),
(6, 1.6604),
(6, 1.1203),
(6, 1.5539),
(6, 1.1841),
(6, 1.3447),
(6, 1.318),
(6, 1.2145),
(6, 1.5093),
(6, 1.222),
(7, 2.8026),
(7, 1.2677),
(7, 1.3518),
(7, 1.2646),
(7, 1.3529),
(7, 1.298),
(7, 1.3879),
(7, 1.5377),
(7, 1.6141),
(7, 1.6608),
(7, 1.6938),
(7, 1.5475),
(7, 1.3327),
(7, 1.3387),
(7, 1.3543),
(7, 1.3318),
(7, 1.2613),
(7, 1.3656),
(7, 1.3646),
(7, 1.3082),
(7, 3.7757),
(7, 1.2824),
(7, 1.4717),
(7, 1.3426),
(7, 1.3604),
(7, 1.3191),
(7, 1.3851),
(7, 1.4107),
(7, 1.3291),
(7, 1.3861),
(7, 1.2749),
(7, 1.3441),
(7, 1.2875),
(7, 1.285),
(7, 1.4011),
(7, 1.285),
(7, 1.4398),
(7, 1.3175),
(7, 1.1406),
(7, 1.1148),
(7, 2.9924),
(7, 1.3008),
(7, 1.3184),
(7, 1.3205),
(7, 1.3085),
(7, 1.3275),
(7, 1.3117),
(7, 1.2819),
(7, 1.3389),
(7, 1.3741),
(7, 1.3308),
(7, 1.2763),
(7, 1.3069),
(7, 1.3578),
(7, 1.3264),
(7, 1.3716),
(7, 1.2968),
(7, 1.3645),
(7, 1.3726),
(7, 1.1437),
(7, 2.8074),
(7, 1.2116),
(7, 1.2206),
(7, 1.3141),
(7, 1.1898),
(7, 1.3442),
(7, 1.1675),
(7, 1.4256),
(7, 1.2796),
(7, 1.3477),
(7, 1.3515),
(7, 1.0426),
(7, 1.2668),
(7, 1.3067),
(7, 1.342),
(7, 1.2743),
(7, 1.3513),
(7, 1.6219),
(7, 1.6259),
(7, 1.6586),
(8, 2.7135),
(8, 1.0404),
(8, 1.2629),
(8, 1.0612),
(8, 1.1745),
(8, 1.1316),
(8, 0.9676),
(8, 1.1561),
(8, 0.9848),
(8, 1.1405),
(8, 1.1975),
(8, 1.0905),
(8, 1.3382),
(8, 1.2419),
(8, 1.221),
(8, 1.2209),
(8, 1.2595),
(8, 1.2315),
(8, 1.1985),
(8, 1.5726),
(8, 2.9819),
(8, 1.1447),
(8, 1.4281),
(8, 1.5031),
(8, 1.4433),
(8, 1.7052),
(8, 1.611),
(8, 1.3322),
(8, 1.2052),
(8, 1.3051),
(8, 1.0381),
(8, 1.1987),
(8, 1.1742),
(8, 1.2184),
(8, 0.9659),
(8, 1.0336),
(8, 1.2008),
(8, 1.23),
(8, 1.1227),
(8, 1.084),
(8, 3.4243),
(8, 1.5459),
(8, 1.705),
(8, 1.4039),
(8, 1.1903),
(8, 1.1655),
(8, 1.1943),
(8, 1.2169),
(8, 1.1924),
(8, 1.2306),
(8, 1.1635),
(8, 1.1598),
(8, 1.2742),
(8, 1.1646),
(8, 1.034),
(8, 1.2087),
(8, 1.1515),
(8, 1.145),
(8, 1.2855),
(8, 1.0425),
(8, 2.9917),
(8, 1.2165),
(8, 1.187),
(8, 1.1772),
(8, 1.2726),
(8, 1.1411),
(8, 1.2505),
(8, 1.2163),
(8, 1.2172),
(8, 1.1765),
(8, 1.2291),
(8, 1.2302),
(8, 1.195),
(8, 1.3805),
(8, 1.4443),
(8, 1.4463),
(8, 1.535),
(8, 1.5171),
(8, 1.2004),
(8, 1.2866),
(8, 2.9194),
(8, 1.1209),
(8, 1.1777),
(8, 1.1953),
(8, 1.3267),
(8, 1.2001),
(8, 1.2174),
(8, 1.1995),
(8, 1.294),
(8, 1.1856),
(8, 1.1948),
(8, 1.235),
(8, 1.1608),
(8, 1.2643),
(8, 1.3034),
(8, 1.5058),
(8, 1.4037),
(8, 1.6096),
(8, 1.4336),
(8, 1.3659),
(10, 3.2814),
(10, 1.1246),
(10, 1.2786),
(10, 1.4048),
(10, 1.321),
(10, 1.0877),
(10, 1.3789),
(10, 1.2656),
(10, 1.2232),
(10, 1.1576),
(10, 1.0716),
(10, 1.1329),
(10, 1.2229),
(10, 1.0674),
(10, 1.1904),
(10, 1.1503),
(10, 1.1237),
(10, 1.0695),
(10, 1.192),
(10, 1.1163),
(2, 4.985),
(2, 3.4106),
(2, 4.4639),
(2, 3.8917),
(2, 3.5325),
(2, 3.6275),
(2, 3.586),
(2, 3.7085),
(2, 3.5506),
(2, 3.882),
(2, 3.4114),
(2, 2.9221),
(2, 3.0728),
(2, 3.2228),
(2, 3.126),
(2, 3.018),
(2, 2.6121),
(2, 3.3835),
(2, 2.688),
(2, 2.7131),
(3, 4.9138),
(3, 3.6681),
(3, 4.228),
(3, 4.2168),
(3, 3.6797),
(3, 3.2504),
(3, 3.3086),
(3, 3.8523),
(3, 3.4246),
(3, 3.3924),
(3, 3.4794),
(3, 3.3593),
(3, 3.7011),
(3, 3.8801),
(3, 3.6497),
(3, 3.4457),
(3, 3.1876),
(3, 3.3091),
(3, 3.2624),
(3, 3.1918),
(4, 3.996),
(4, 2.3734),
(4, 2.3895),
(4, 2.027),
(4, 2.0217),
(4, 1.9908),
(4, 2.0311),
(4, 1.9258),
(4, 2.0102),
(4, 2.0338),
(4, 2.0078),
(4, 2.0199),
(4, 1.9693),
(4, 2.0876),
(4, 1.9746),
(4, 2.1291),
(4, 2.0353),
(4, 2.0223),
(4, 1.9693),
(4, 2.1176),
(5, 3.6458),
(5, 1.9484),
(5, 2.0161),
(5, 1.999),
(5, 1.9481),
(5, 2.0306),
(5, 2.0121),
(5, 2.0052),
(5, 1.9338),
(5, 1.9788),
(5, 1.8997),
(5, 2.0425),
(5, 2.009),
(5, 2.0407),
(5, 2.5651),
(5, 2.3494),
(5, 4.0412),
(5, 2.3624),
(5, 2.1484),
(5, 2.1279),
(6, 3.0398),
(6, 1.3934),
(6, 1.5696),
(6, 1.3557),
(6, 1.5808),
(6, 1.2172),
(6, 1.4345),
(6, 1.2293),
(6, 1.1803),
(6, 1.5682),
(6, 1.2226),
(6, 1.3786),
(6, 1.1973),
(6, 1.2538),
(6, 1.326),
(6, 1.285),
(6, 1.4086),
(6, 1.4677),
(6, 1.325),
(6, 1.7864),
(6, 2.8935),
(6, 1.4145),
(6, 1.2627),
(6, 1.2306),
(6, 1.4593),
(6, 1.4569),
(6, 1.4273),
(6, 1.2546),
(6, 1.8061),
(6, 1.7507),
(6, 1.8094),
(6, 1.6604),
(6, 1.1203),
(6, 1.5539),
(6, 1.1841),
(6, 1.3447),
(6, 1.318),
(6, 1.2145),
(6, 1.5093),
(6, 1.222),
(7, 2.8026),
(7, 1.2677),
(7, 1.3518),
(7, 1.2646),
(7, 1.3529),
(7, 1.298),
(7, 1.3879),
(7, 1.5377),
(7, 1.6141),
(7, 1.6608),
(7, 1.6938),
(7, 1.5475),
(7, 1.3327),
(7, 1.3387),
(7, 1.3543),
(7, 1.3318),
(7, 1.2613),
(7, 1.3656),
(7, 1.3646),
(7, 1.3082),
(7, 3.7757),
(7, 1.2824),
(7, 1.4717),
(7, 1.3426),
(7, 1.3604),
(7, 1.3191),
(7, 1.3851),
(7, 1.4107),
(7, 1.3291),
(7, 1.3861),
(7, 1.2749),
(7, 1.3441),
(7, 1.2875),
(7, 1.285),
(7, 1.4011),
(7, 1.285),
(7, 1.4398),
(7, 1.3175),
(7, 1.1406),
(7, 1.1148),
(7, 2.9924),
(7, 1.3008),
(7, 1.3184),
(7, 1.3205),
(7, 1.3085),
(7, 1.3275),
(7, 1.3117),
(7, 1.2819),
(7, 1.3389),
(7, 1.3741),
(7, 1.3308),
(7, 1.2763),
(7, 1.3069),
(7, 1.3578),
(7, 1.3264),
(7, 1.3716),
(7, 1.2968),
(7, 1.3645),
(7, 1.3726),
(7, 1.1437),
(7, 2.8074),
(7, 1.2116),
(7, 1.2206),
(7, 1.3141),
(7, 1.1898),
(7, 1.3442),
(7, 1.1675),
(7, 1.4256),
(7, 1.2796),
(7, 1.3477),
(7, 1.3515),
(7, 1.0426),
(7, 1.2668),
(7, 1.3067),
(7, 1.342),
(7, 1.2743),
(7, 1.3513),
(7, 1.6219),
(7, 1.6259),
(7, 1.6586),
(8, 2.7135),
(8, 1.0404),
(8, 1.2629),
(8, 1.0612),
(8, 1.1745),
(8, 1.1316),
(8, 0.9676),
(8, 1.1561),
(8, 0.9848),
(8, 1.1405),
(8, 1.1975),
(8, 1.0905),
(8, 1.3382),
(8, 1.2419),
(8, 1.221),
(8, 1.2209),
(8, 1.2595),
(8, 1.2315),
(8, 1.1985),
(8, 1.5726),
(8, 2.9819),
(8, 1.1447),
(8, 1.4281),
(8, 1.5031),
(8, 1.4433),
(8, 1.7052),
(8, 1.611),
(8, 1.3322),
(8, 1.2052),
(8, 1.3051),
(8, 1.0381),
(8, 1.1987),
(8, 1.1742),
(8, 1.2184),
(8, 0.9659),
(8, 1.0336),
(8, 1.2008),
(8, 1.23),
(8, 1.1227),
(8, 1.084),
(8, 3.4243),
(8, 1.5459),
(8, 1.705),
(8, 1.4039),
(8, 1.1903),
(8, 1.1655),
(8, 1.1943),
(8, 1.2169),
(8, 1.1924),
(8, 1.2306),
(8, 1.1635),
(8, 1.1598),
(8, 1.2742),
(8, 1.1646),
(8, 1.034),
(8, 1.2087),
(8, 1.1515),
(8, 1.145),
(8, 1.2855),
(8, 1.0425),
(8, 2.9917),
(8, 1.2165),
(8, 1.187),
(8, 1.1772),
(8, 1.2726),
(8, 1.1411),
(8, 1.2505),
(8, 1.2163),
(8, 1.2172),
(8, 1.1765),
(8, 1.2291),
(8, 1.2302),
(8, 1.195),
(8, 1.3805),
(8, 1.4443),
(8, 1.4463),
(8, 1.535),
(8, 1.5171),
(8, 1.2004),
(8, 1.2866),
(8, 2.9194),
(8, 1.1209),
(8, 1.1777),
(8, 1.1953),
(8, 1.3267),
(8, 1.2001),
(8, 1.2174),
(8, 1.1995),
(8, 1.294),
(8, 1.1856),
(8, 1.1948),
(8, 1.235),
(8, 1.1608),
(8, 1.2643),
(8, 1.3034),
(8, 1.5058),
(8, 1.4037),
(8, 1.6096),
(8, 1.4336),
(8, 1.3659)
]

View File

@@ -1,11 +1,14 @@
"""
This Module defines functions to compute the kendall tau distance between two
rankings, and the kemeny-young rank aggregation method.
"""
import numpy as np
from numba import jit, njit
from itertools import permutations
from tools import combinations_of_2
from tools import combinations_of_2, Number
from tqdm import tqdm
from tprint import tprint
Number = int|float
# original, unoptimized version, but it's more readable
# def kendall_tau_dist(rank_a, rank_b) -> int:
@@ -18,10 +21,10 @@ Number = int|float
def kendall_tau_dist(ranking_a: list[int], ranking_b: list[int]) -> Number:
"""The kendall τ distance between two rankings / permutations.
It is the number of inversions that don't have the same sign within all pairs of an inversion of ranking_a and an inversion of ranking_b.
It is the number of inversions that don't have the same sign within all
pairs of an inversion of ranking_a and an inversion of ranking_b.
"""
ranking_a = np.array(ranking_a)
ranking_b = np.array(ranking_b)
@@ -42,11 +45,12 @@ def __tau(A: list[int], B: list[int]) -> int:
def rank_aggregation(rankings: list[list[int]]) -> tuple[int, tuple[int, ...]]:
"""Brute-force kemeny-young rank aggregation.
"""Return the order elected by the kemeny-young method.
Args:
ranks: A list of the ranks (2D numpy array).
ranks: A list of the ranks (2D numpy array) to elect from.
Returns:
int, list: The minimal sum of distances to ranks, the rank of minimal distance.
int, list: The minimal sum of distances to ranks, the rank of minimal
distance.
"""
rankings = np.array(rankings)
min_dist: int = np.inf
@@ -67,6 +71,9 @@ def rank_aggregation(rankings: list[list[int]]) -> tuple[int, tuple[int, ...]]:
return min_dist, best_ranking
#################################### TESTS #####################################
if __name__ == '__main__':
ranks = np.array([[0, 1, 2, 3, 4],
[0, 1, 3, 2, 4],
@@ -76,17 +83,29 @@ if __name__ == '__main__':
# print(rank_aggregation(ranks))
# print(kendall_tau_dist([1, 2, 3],
# [3, 1, 2]))
rankings = np.argsort(list('abc')), np.argsort(list('bda'))
a, b = rankings[0], rankings[1]
print(a, b)
print(rank_aggregation(rankings))
print(rank_aggregation([[1, 2, 3], [2, 4, 1]]))
ranks = np.array(list(permutations(range(7))))
for _ in tqdm(range(10)):
selected_lines = np.random.randint(ranks.shape[0], size=30)
selected = ranks[selected_lines,:]
print(rank_aggregation(selected))
# tprint(selected)
# print(ranks)
# print(kendalltau_dist(ranks[5], ranks[-1]))
# print(np_kendalltau_dist(ranks[5], ranks[-1]))
orderings = np.array([["salut", "coucou", "bonjour"],
["coucou", "hello", "bonjour"],
["hey", "salut", "coucou"],
["bonjour", "coucou", "hey"]])
print(rank_aggregation(np.argsort(orderings, axis=1)))
print(rank_aggregation(np.vectorize(hash)(orderings)))
print(np.vectorize(hash)(orderings))
# ranks = np.array(list(permutations(range(7))))
# for _ in tqdm(range(10)):
# selected_lines = np.random.randint(ranks.shape[0], size=30)
# selected = ranks[selected_lines,:]
# print(rank_aggregation(selected))
# # tprint(selected)
# # print(ranks)
# # print(kendalltau_dist(ranks[5], ranks[-1]))
# # print(np_kendalltau_dist(ranks[5], ranks[-1]))

32
src/losses.py Normal file
View File

@@ -0,0 +1,32 @@
from tools import Number
import orderankings as odrk
import kemeny_young as ky
def orderings_average_loss(orderings: list[list[str]], truth: list[str]) -> float:
    """Average of the Kendall tau distances between the true ordering and
    each of the given orderings."""
    truth_rank = odrk.rankings_from_orderings([truth])[0]
    sample_ranks = odrk.rankings_from_orderings(orderings)
    return rankings_average_loss(sample_ranks, truth_rank)
def rankings_average_loss(rankings: list[list[int]], truth: list[int]) -> float:
    """Mean Kendall tau distance between *truth* and each given ranking."""
    total = 0
    for rank in rankings:
        total += ky.kendall_tau_dist(rank, truth)
    # apparently, dividing by the number of rankings is what works for a
    # good normalization
    # (a previously tried alternative: total * 2 / (n * (n - 1)))
    return total / len(rankings)
def kmny_dist_loss(orderings: list[list[str]], truth: list[str]) -> Number:
    """Kendall tau distance between the truth and the Kemeny-Young
    aggregation of the given orderings."""
    sample_rankings = odrk.rankings_from_orderings(orderings)
    _, best_ranking = ky.rank_aggregation(sample_rankings)
    consensus = odrk.ordering_from_ranking(best_ranking, truth)
    return ky.kendall_tau_dist(odrk.ranking_from_ordering(consensus),
                               odrk.ranking_from_ordering(truth))

View File

@@ -12,10 +12,13 @@ you index to get back the values from the indexes.
Rankings are similar to mathematical "permutations".
"""
import numpy as np
from tprint import tprint
from kemeny_young import rank_aggregation
VERBOSE=False
from kemeny_young import rank_aggregation
from tprint import tprint
from collections import defaultdict
VERBOSE = False
# def inverse_permutation(permutation: list[int]) -> list[int]:
# """Return the inverse of a given permutation."""
@@ -39,8 +42,7 @@ def inverse_permutation(permutation: list[int]) -> list[int]:
return inverse
def get_orderings_from_table(table: np.ndarray, column_index: int =0) -> list:
def get_orderings_from_table(table: np.ndarray, column_index: int = 0) -> list:
"""Extract a list of orderings from a table coming out of a sql query.
This basically means that you extract values of the given column, while
keeping order but removing duplicates.
@@ -51,13 +53,19 @@ def get_orderings_from_table(table: np.ndarray, column_index: int =0) -> list:
extract the orderings from.
"""
table = np.array(table)
values = table[:,column_index]
values = table[:, column_index]
ranking, indexes = np.unique(values, return_index=True)
return values[np.sort(indexes)] # distinct ordered values
def get_all_orderings_from_table(table: list[tuple]) -> dict:
orders = dict()
def get_all_orderings_from_table(table: list[list[str]]) -> dict[str, list[str]]:
"""Return a dictionnary mapping a value of the criteria to the order you
get when selecting on this value.
This means you get all orders of a table, where the criteria is in the
second column.
IMPORTANT: this function assumes that values are already sorted
appropriately. If not, the resulting orders won't be correct."""
orders = defaultdict()
for line in table:
parameter, criteria, sum_value = line
if orders.get(criteria) is None:
@@ -73,7 +81,8 @@ def rankings_from_orderings(orderings: list[list[str]]) -> list[list[int]]:
matching ordering into alphabetical order.
"""
orderings = np.array(orderings)
rankings = np.argsort(orderings, axis=1)
# rankings = np.argsort(orderings, axis=1)
rankings = np.vectorize(hash)(orderings)
if VERBOSE:
print("found rankings :")
tprint(rankings)
@@ -83,6 +92,7 @@ def rankings_from_orderings(orderings: list[list[str]]) -> list[list[int]]:
def ranking_from_ordering(ordering: list[str]) -> list[int]:
return rankings_from_orderings([ordering])[0]
def ordering_from_ranking(ranking: list[int], values_to_order: list[str]) -> list[str]:
"""Get an order of values from a ranking of these values.
This is basically the inverse function of *rankings_from_orderings*.
@@ -99,25 +109,25 @@ def ordering_from_ranking(ranking: list[int], values_to_order: list[str]) -> lis
return np.sort(values_to_order)[inversed_ranking]
# def ordering_from_ranking(ranking: list[int],
# reference_ordering: list[str],
# reference_ranking: list[int]):
# """Get an ordering of values from a ranking, using a reference ordering and
# ranking (the ranking must match the ordering)."""
# # make sure you are using numpy arrays
# ref_ordering = np.array(reference_ordering)
# ref_ranking = np.array(reference_ranking)
# # get back the best order from the best ranking
# ordering = ref_ordering[ref_ranking[[ranking]]][0]
# if VERBOSE: print("best ordering :", ordering)
# return ordering
def ordering_from_ranking(ranking: list[int],
reference_ordering: list[str],
reference_ranking: list[int]):
"""Get an ordering of values from a ranking, using a reference ordering and
ranking (the ranking must match the ordering)."""
# make sure you are using numpy arrays
ref_ordering = np.array(reference_ordering)
ref_ranking = np.array(reference_ranking)
# get back the best order from the best ranking
ordering = ref_ordering[ref_ranking[[ranking]]][0]
if VERBOSE: print("best ordering :", ordering)
return ordering
def aggregate_rankings(rankings: list[list[int]]) -> tuple[int, ...]:
"""Calculate the aggregation of all given rankings, that is the ranking
that is the nearest to all given rankings."""
min_dist, best_ranking = rank_aggregation(rankings)
if VERBOSE: print("best ranking :", best_ranking)
if VERBOSE:
print("best ranking :", best_ranking)
return best_ranking

View File

@@ -7,6 +7,7 @@ from abc import ABC, abstractmethod
class QueryGenerator(ABC):
"""Abstract class to define what methods should a query generator have."""
@abstractmethod
def __init__(self): ...
@@ -14,18 +15,23 @@ class QueryGenerator(ABC):
def __str__(self) -> str: ...
class QueryWithParameter(QueryGenerator):
# DEFAULT_AUTHORIZED_PARAMETER_VALUES: tuple[str, ...] = ("foo", "bar")
class QueryWithParameter(QueryGenerator, ABC):
"""Abstract class for query generators with our 3 parameters.
This class implements the gestion of 3 attributes : `parameter`,
`authorized_parameter_values` and `summed_attribute`. They are managed so
that there is no typing error, and using default values. Importantly, the
default value of authorized_parameter_values (when not given or set to
None) is the the value of `self.DEFAULT_AUTHORIZED_PARAMETER_VALUES`.
"""
def __init__(self, parameter: str|None =None,
authorized_parameter_values: tuple[str, ...] | None = None,
summed_attribute: str|None =None):
if parameter is None: raise ValueError
self.parameter = str(parameter)
self.__parameter = str(parameter)
if authorized_parameter_values is None:
authorized_parameter_values = self.DEFAULT_AUTHORIZED_PARAMETER_VALUES
self.authorized_parameter_values = authorized_parameter_values
self.__authorized_parameter_values = authorized_parameter_values
self.__force_typing_on_authorized_parameter_values()
if summed_attribute is None: raise ValueError
self.summed_attribute = str(summed_attribute)
@@ -39,6 +45,8 @@ class QueryWithParameter(QueryGenerator):
self.__parameter = str(value)
def __force_typing_on_authorized_parameter_values(self):
if self.__authorized_parameter_values is None:
self.__authorized_parameter_values = self.DEFAULT_AUTHORIZED_PARAMETER_VALUES
self.__authorized_parameter_values = tuple(
map(str, self.__authorized_parameter_values))
@@ -54,6 +62,8 @@ class QueryWithParameter(QueryGenerator):
class QueryWithParameterGroupedByCriteria(QueryWithParameter):
"""Similar to QueryWithParameter, but with an addtional parameter : `criteria`.
The results are grouped by criteria, and values of `summed_attribute` are summed for each `parameter`, to give an order on `parameter`'s values"""
def __init__(self, parameter: str|None =None,
authorized_parameter_values: tuple[str, ...] | None =None,
@@ -67,7 +77,7 @@ class QueryWithParameterGroupedByCriteria(QueryWithParameter):
authorized_parameter_values = self.DEFAULT_AUTHORIZED_PARAMETER_VALUES
self.authorized_parameter_values = authorized_parameter_values
self.criteria = criteria
self.__criteria = str(criteria)
if summed_attribute is None: raise ValueError
self.summed_attribute = str(summed_attribute)
@@ -162,7 +172,7 @@ class QuerySSBWithParameterGroupedByCriteria(QueryWithParameterGroupedByCriteria
res += "INNER JOIN date ON lo_orderdate = D_DATEKEY\n"
if self.authorized_parameter_values is not None:
res += "WHERE {self.parameter} IN {self.authorized_parameter_values}\n"
res += f"WHERE {self.parameter} IN {self.authorized_parameter_values}\n"
res += f"""

View File

@@ -1,56 +1,109 @@
import sqlite3
import numpy as np
from random import choice
from tprint import tprint
from joblib import Memory # for persistent memoïzation
from query_generator import *
import orderankings as odrk
import kemeny_young as km
from joblib import Memory
from config import CONFIG, DATABASE_CFG, VENV_HOME, DATABASE_FILE
# persistent memoïzation
memory = Memory("cache")
if CONFIG["persistent_query_memoization"]:
memory = Memory(f"{VENV_HOME}/src/cache")
else:
# if memoïzation is disabled, then just use the false memoization decorator
class FalseMemory:
def cache(self, func):
"""This is a decorator that does nothing to its function."""
return func
memory = FalseMemory()
DATABASE_NAME = "flight_delay"
DATABASE_NAME = "SSB"
VERBOSE = CONFIG["verbose"]["querying"]
################################################################################
# Connexion to sqlite database
odrk.VERBOSE = False
VERBOSE = True
######################### Connexion to sqlite database #########################
# initialize database connection
DATABASE_FILE = f"../{DATABASE_NAME}_dataset/{DATABASE_NAME}.db"
if VERBOSE: print(f"connecting to {DATABASE_FILE}")
if VERBOSE:
print(f"connecting to {DATABASE_FILE}")
CON = sqlite3.connect(DATABASE_FILE)
CUR = CON.cursor()
@memory.cache # persistent memoïzation
def query(q: str) -> list[tuple]:
"""Execute a given query and reture the result in a python list[tuple]."""
if VERBOSE: print(f'sending query : {q}')
if VERBOSE:
print(f'sending query : {q}')
res = CUR.execute(str(q))
if VERBOSE: print("got response", res)
if VERBOSE:
print("got response", res)
return res.fetchall()
################################################################################
# Choice of the right query generator
if DATABASE_NAME == "flight_delay":
QUERY_PARAM_GB_FACTORY = QueryFlightWithParameterGroupedByCriteria
QUERY_PARAM_FACTORY = QueryFlightWithParameter
elif DATABASE_NAME == "SSB":
QUERY_PARAM_GB_FACTORY = QuerySSBWithParameterGroupedByCriteria
QUERY_PARAM_FACTORY = QuerySSBWithParameter
##################### Choice of the right query generator ######################
################################################################################
# orderings extraction functions
QUERY_PARAM_GB_CONSTRUCTOR = DATABASE_CFG["query_generator"]
######################## orderings extraction functions ########################
def random_query() -> list[tuple]:
    """Build a query for a randomly chosen criteria (all other settings come
    from the database configuration) and return the rows it yields."""
    cfg = DATABASE_CFG
    make_query = cfg["query_generator"]
    sql_query = make_query(
        parameter=cfg["parameter"],
        authorized_parameter_values=cfg["authorized_parameter_values"],
        criteria=choice(cfg["criterion"]),
        summed_attribute=cfg["summed_attribute"])
    if VERBOSE:
        print("query :", str(sql_query), sep="\n")
    rows = query(str(sql_query))  # fetch the result from the database
    if VERBOSE:
        print("query result :")
        tprint(rows)
    return rows
def filter_correct_length_orderings(orderings: list[tuple], length: int) -> list[tuple]:
    """Keep only the orderings usable at the requested length: orderings
    shorter than `length` are dropped, longer ones are truncated to it."""
    truncated = [ordering[:length]
                 for ordering in orderings
                 if len(ordering) >= length]
    kept = np.array(truncated)
    if VERBOSE:
        print(f"found {len(kept)} orderings :")
        tprint(kept)
    return kept
def rankings_from_table(query_result: list[tuple]):
    """Extract every ordering from a query result table, keep those matching
    the configured length, and convert them into rankings."""
    all_orderings = odrk.get_all_orderings_from_table(query_result).values()
    usable = filter_correct_length_orderings(all_orderings,
                                             DATABASE_CFG["orders_length"])
    if VERBOSE:
        print(usable)
    return odrk.rankings_from_orderings(usable)
@memory.cache # persistent memoïzation
def find_orderings(parameter: str, summed_attribute: str, criterion: tuple[str, ...],
length: int,
authorized_parameter_values: list[str] =None
authorized_parameter_values: tuple[str, ...] | None = None
) -> list[list[str]]:
"""Gather the list of every ordering returned by queries using given values
of parameter, summed_attribute, and all given values of criterion.
@@ -63,16 +116,13 @@ def find_orderings(parameter: str, summed_attribute: str, criterion: tuple[str,
Returns:
list[list]: The list of all found orderings.
"""
# instanciate the query generator
qg = QUERY_PARAM_GB_FACTORY(parameter=parameter,
summed_attribute=summed_attribute,
criteria=None)
if authorized_parameter_values is None:
# reduce the number of compared parameter values
qg.authorized_parameter_values = qg.authorized_parameter_values#[:length]
else:
qg.authorized_parameter_values = authorized_parameter_values#[:length]
# instanciate the query generator
qg = DATABASE_CFG["query_generator"](
parameter=parameter,
authorized_parameter_values=authorized_parameter_values,
summed_attribute=summed_attribute,
criteria=None)
# ensemble de tous les ordres trouvés
# la clef est la valeur dans la colonne criteria
@@ -90,159 +140,6 @@ def find_orderings(parameter: str, summed_attribute: str, criterion: tuple[str,
# update the global list of all found orders
orderings.extend(table_orders.values())
# keep only orders that are of the specified length
# that means removing too short ones, and slicing too long ones
correct_length_orderings = np.array(
[ordrng[:length] for ordrng in orderings if len(ordrng) >= length]
)
if VERBOSE:
print(f"found {len(correct_length_orderings)} orderings :")
print(correct_length_orderings)
# tprint(correct_length_orderings)
correct_length_orderings = filter_correct_length_orderings(orderings, length)
return correct_length_orderings
@memory.cache  # persistent memoïzation
def find_true_ordering_ranking(parameter: str,
                               summed_attribute: str,
                               length: int,
                               authorized_parameter_values: tuple[str, ...] | None = None
                               ) -> tuple[list[list[str]], list[list[int]]]:
    """Return the true (ordering, ranking), considering the data as a whole (no
    grouping by), and getting the true order (no rankings aggregation)."""
    # Only pass authorized_parameter_values through when the caller gave one,
    # so the factory's own default applies otherwise.
    factory_kwargs = {"parameter": parameter,
                      "summed_attribute": summed_attribute}
    if authorized_parameter_values is not None:
        factory_kwargs["authorized_parameter_values"] = authorized_parameter_values
    qg = QUERY_PARAM_FACTORY(**factory_kwargs)
    rows = query(str(qg))
    if VERBOSE:
        print(rows)
    ordering = odrk.get_orderings_from_table(rows)
    ranking = odrk.rankings_from_orderings([ordering])[0]
    return ordering, ranking
################################################################################
def flight_delay_main():
    """Run the rank-aggregation experiment on the flight_delay database."""
    parameter = "departure_airport"
    summed_attribute = "nb_flights"
    length = 5
    # ground truth: order over the whole table, no group-by
    ordering, ranking = find_true_ordering_ranking(
        parameter=parameter,
        summed_attribute=summed_attribute,
        length=length)
    print(ordering, ranking)
    # candidate group-by columns (others: "airline", "departure_hour", "month")
    criterion = [
        "day",
    ]
    rng = np.random.default_rng()
    rng.shuffle(criterion)
    grouped_orderings = find_orderings(parameter=parameter,
                                       summed_attribute=summed_attribute,
                                       criterion=criterion,
                                       length=length)
    print(grouped_orderings)
    # aggregate the grouped rankings into one inferred ranking
    grouped_rankings = odrk.rankings_from_orderings(grouped_orderings)
    _, inferred_ranking = km.rank_aggregation(grouped_rankings)
    inferred_ranking = np.array(inferred_ranking)
    inferred_order = odrk.ordering_from_ranking(inferred_ranking,
                                                grouped_orderings[0])
    print("inferred :")
    print(inferred_order, inferred_ranking)
################################################################################
def SSB_main():
    """Run the rank-aggregation experiment on the SSB database."""
    parameter = "p_color"
    # alternatives: "lo_revenue", "lo_extendedprice"
    summed_attribute = "lo_quantity"
    length = 2
    criterion = (
        # customer table
        "c_region",
        "c_city",
        "c_nation",
        # part table
        "p_category",
        "p_brand",
        "p_mfgr",
        "p_color",
        "p_type",
        "p_container",
        # supplier table
        "s_city",
        "s_nation",
        "s_region",
        # order-date table (the other D_* columns are disabled for now)
        "D_WEEKNUMINYEAR",
    )
    hypothesis_ordering = ("aquamarine", "dark")
    # ground truth restricted to the hypothesised pair of parameter values
    ordering, ranking = find_true_ordering_ranking(
        parameter=parameter,
        summed_attribute=summed_attribute,
        length=length,
        authorized_parameter_values=hypothesis_ordering)
    print(ordering, ranking)
    grouped_orderings = find_orderings(parameter=parameter,
                                       summed_attribute=summed_attribute,
                                       criterion=criterion,
                                       length=length)
    tprint(grouped_orderings, limit=20)
    # aggregate the grouped rankings into one inferred ranking
    grouped_rankings = odrk.rankings_from_orderings(grouped_orderings)
    _, inferred_ranking = km.rank_aggregation(grouped_rankings)
    inferred_ranking = np.array(inferred_ranking)
    inferred_order = odrk.ordering_from_ranking(inferred_ranking,
                                                grouped_orderings[0])
    print("inferred :")
    print(inferred_order, inferred_ranking)
if __name__ == '__main__':
    # Dispatch on the configured database; unknown names fall through silently,
    # matching the original if/elif behaviour.
    entry_points = {"SSB": SSB_main, "flight_delay": flight_delay_main}
    main = entry_points.get(DATABASE_NAME)
    if main is not None:
        main()

View File

@@ -2,7 +2,9 @@ import numpy as np
from numba import jit
from fastcache import lru_cache
# @lru_cache(maxsize=16)
# Union of the plain Python scalar number types (PEP 604 `|` syntax).
Number = int | float
@lru_cache(maxsize=4)
def combinations_of_2(size: int):
"""Returns an array of size n*2, containing every pair of two integers
smaller than size, but not listing twice the pairs with the same numbers
@@ -19,7 +21,7 @@ def __combinations_of_2(size: int):
"""Compiled helper."""
# return np.array(list(combinations(range(size), 2)))
# return np.array(np.meshgrid(np.arange(size), np.arange(size))).T.reshape(-1, 2)
return np.array([[i, j] for i in range(0, size) for j in range(0, size) if i<j])
return np.array([[i, j] for i in range(size) for j in range(size) if i<j])