Skip to content

Commit a723fd3

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents 663d6a7 + 8b4a36f commit a723fd3

13 files changed

Lines changed: 3221 additions & 4 deletions

deps.edn

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313
clj-thamil/clj-thamil {:mvn/version "0.2.0"}
1414
org.scicloj/clay {#_#_:mvn/version "2-beta54"
1515
:git/url "https://github.com/scicloj/clay.git"
16-
:git/sha "26fb97b5ea2f72a98f8a64d5b8b8abf705767a39"}
16+
:git/sha "c4fa35c11034614d42c48c1223d8767267cad385"}
17+
org.scicloj/kindly {:mvn/version "4-beta20"}
1718
thi.ng/geom {:mvn/version "1.0.1"}
1819
org.eclipse.elk/org.eclipse.elk.core {:mvn/version "0.10.0"}
1920
org.eclipse.elk/org.eclipse.elk.graph {:mvn/version "0.10.0"}

site/db.edn

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,12 @@
116116
:email "edwardaw@connect.hku.hk"
117117
:affiliation [:cim]
118118
:links [{:icon "github" :href "https://github.com/W-Edward"}]}
119+
{:id :tombarys
120+
:name "Tomáš Baránek"
121+
:image "https://avatars.githubusercontent.com/u/5729352?v=4"
122+
:url "https://barys.me/#english-section"
123+
:email "tom@barys.me"
124+
:links [{:icon "github" :href "https://github.com/tombarys"}]}
119125
{:id :mattb
120126
:name "Matthias Buehlmaier"
121127
:affiliation [:cim]}
@@ -150,8 +156,7 @@
150156
:url "https://techascent.com"}
151157
{:id :cim
152158
:name "cim"
153-
:url "https://cim.hkubs.hku.hk/"
154-
}]
159+
:url "https://cim.hkubs.hku.hk/"}]
155160

156161
:topic
157162
[{:id :core

src/data_analysis/book_sales_analysis/about_apriori.clj

Lines changed: 349 additions & 0 deletions
Large diffs are not rendered by default.

src/data_analysis/book_sales_analysis/anonymized-all-sliced-sharing-V2.csv

Lines changed: 2001 additions & 0 deletions
Large diffs are not rendered by default.
442 KB
Loading
Lines changed: 311 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,311 @@
1+
(ns data-analysis.book-sales-analysis.core-helpers-v2
2+
(:import [java.text Normalizer Normalizer$Form]
3+
[java.io ByteArrayInputStream ObjectInputStream])
4+
(:require [tablecloth.api :as tc]
5+
[tech.v3.dataset :as ds]
6+
[tablecloth.column.api :as tcc]
7+
[clojure.string :as str]
8+
[java-time.api :as jt]
9+
[fastmath.stats :as stats]))
10+
11+
;; ## Data Transformation Functions
12+
;; Common data processing functions used across multiple analysis files
13+
14+
;; ### Scicloj Helpers
15+
16+
(defn merge-csvs
  "Loads every CSV file in `file-list` as a dataset using the same `options`
  map (passed straight to `tc/dataset`) and concatenates them into one dataset."
  [file-list options]
  (->> (mapv #(tc/dataset % options) file-list)
       (apply tc/concat)))
19+
20+
;; ### Column and Content Sanitizers
21+
22+
(defn sanitize-str
  "Sanitizes a string for use as a slug or identifier.
  Underscores and spaces become hyphens; diacritics and parentheses are
  removed and the result is lower-cased. nil and empty strings are
  returned unchanged. Intended for general-purpose text like book titles."
  [s]
  (if (seq s)
    (-> s
        (str/replace #"_" "-")
        str/trim
        (Normalizer/normalize Normalizer$Form/NFD)
        (str/replace #"\p{InCombiningDiacriticalMarks}+" "")
        (str/replace #" " "-")
        (str/replace #"\(|\)" "")
        str/lower-case)
    s))
37+
38+
39+
(defn sanitize-column-name-str
  "Sanitizes a string for use as a dataset column name.
  More aggressive than `sanitize-str`: also converts slashes to hyphens,
  collapses runs of hyphens, and strips the special '(YYYY-MM)' marker.
  nil and empty strings are returned unchanged."
  [s]
  (if (or (nil? s) (empty? s))
    s
    (let [no-marker  (str/replace s #"\(YYYY-MM\)" "")  ; special removal
          lowered    (-> no-marker str/trim str/lower-case)
          hyphenated (-> lowered
                         (str/replace #"_" "-")         ; underscores -> hyphens
                         (str/replace #" " "-")         ; spaces -> hyphens
                         (str/replace #"\/" "-")        ; slashes -> hyphens
                         (str/replace #"-{2,}" "-"))    ; collapse hyphen runs
          normalized (Normalizer/normalize hyphenated Normalizer$Form/NFD)
          plain      (str/replace normalized #"\p{InCombiningDiacriticalMarks}+" "")]
      (str/replace plain #"\(|\)" ""))))
57+
58+
(defn sanitize-category-str
  "Sanitizes a string representing categories.
  Similar to other sanitizers, but specifically handles comma-separated lists by removing the space
  after a comma (e.g., 'a, b' -> 'a,b')."
  [s]
  (if (or (nil? s) (empty? s))
    s
    (-> s
        str/trim
        (str/lower-case)
        (str/replace #"\,\s" ",") ; remove the space after a comma: 'a, b' -> 'a,b'
        (str/replace #"\s" "-") ; whitespace to hyphens
        (str/replace #"\/" "-") ; slash to hyphens
        (str/replace #"-{2,}" "-") ; multiple hyphens to one
        (#(Normalizer/normalize % Normalizer$Form/NFD)) ; nfd-normalized
        (str/replace #"\p{InCombiningDiacriticalMarks}+" "") ; no-diacritics (temporary)
        (str/replace #"\(|\)" ""))))
75+
76+
(defn parse-book-name
  "Normalizes a single book-name string into a slug-like form:
  strips commas and plus signs, trims, sanitizes via `sanitize-category-str`,
  and prefixes a leading digit 3/5 with 'k' (so the result is usable as a keyword name)."
  [s]
  (-> s ;; unlike parse-books, this takes just a single string
      (str/replace #"," "")
      (str/replace #"\+" "")
      (str/trim)
      sanitize-category-str
      (str/replace #"^3" "k3")
      (str/replace #"^5" "k5")))
84+
85+
86+
(defn parse-csv-date
  "Parses a date string from the CSV export into a java-time local date,
  pinning the day to the 1st of the month. Two input shapes are handled:
  - longer than 6 chars: assumes 'dd.MM.yyyy' (month at chars 3-4, year at 6-9) — TODO confirm against source CSV
  - otherwise: '<czech-month-abbrev>.<yy>', e.g. 'zář.23'
  On any parse failure returns the string \"Chyba: <input>\" instead of throwing,
  so callers must handle a mixed return type."
  [date-str]
  (let [month-names ["led" "úno" "bře" "dub" "kvě" "čvn" "čvc" "srp" "zář" "říj" "lis" "pro"] ; Czech abbreviations, Jan..Dec
        pad-month #(format "%02d" %)
        ;; full input -> 'yyyy-01-MM' string (matches the 'yyyy-dd-MM' pattern below)
        parse-full-date (fn [s]
                          (let [month (Integer/parseInt (subs s 3 5))]
                            (str (subs s 6 10) "-01-" (pad-month month))))
        ;; short input -> 'yyyy-01-MM'; month index found in month-names
        parse-short-date (fn [s]
                           (let [[month-str year-str] (str/split s #"\.")
                                 month (inc (.indexOf month-names month-str))
                                 year (+ 2000 (Integer/parseInt year-str))]
                             (str year "-01-" (pad-month month))))]
    (try
      ;; NOTE(review): the pattern 'yyyy-dd-MM' deliberately reads the middle
      ;; field as the (constant "01") day and the LAST field as the month.
      (jt/local-date "yyyy-dd-MM"
                     (if (> (count date-str) 6)
                       (parse-full-date date-str)
                       (parse-short-date date-str)))
      (catch Exception _
        (str "Chyba: " date-str)))))
104+
105+
(defn parse-books-from-list
  "Parses book names from the comma-separated string `s` into a vector of
  distinct cleaned keywords. Strips quantities, format/bundle markers
  (paper/e-book/audio/complete editions) and filters out non-book items
  (vouchers, notebooks, merch, ...). Returns nil for blank input."
  [s]
  (if (seq s) (->> (str/split s #",\s\d+")
                   (map #(str/replace % #"\d\s" "")) ; drop leading quantities
                   (map #(str/replace % #"," ""))
                   ;; drop format/bundle markers like '(P+E+A)', 'papír', 'audio', 'e-kniha'
                   (map #(str/replace % #"\(A\+E\)|\[|\]|komplet|a\+e|\s\(P\+E\+A\)|\s\(e\-kniha\)|\s\(P\+E\)|\s\(P\+A\)|\s\(E\+A\)|papír|papir|audio|e\-kniha|taška" ""))
                   (map #(str/replace % #"\+" ""))
                   (map #(str/trim %))
                   (map sanitize-str)
                   (map #(str/replace % #"\-\-.+$" "")) ; drop trailing '--...' suffixes
                   (map #(str/replace % #"\-+$" "")) ; drop trailing hyphens
                   (map #(str/replace % #"^3" "k3")) ; keyword names must not start with a digit
                   (map #(str/replace % #"^5" "k5"))
                   ;; filter out non-book products by substring match
                   (remove (fn [item] (some (fn [substr] (str/includes? (name item) substr))
                                            ["balicek" "poukaz" "zapisnik" "limitovana-edice" "taska" "aktualizovane-vydani" "cd" "puvodni-vydani/neni-skladem"
                                             "merch"])))
                   distinct
                   (mapv keyword))
      nil))
125+
126+
;; ### Metadata Enriching and Convenience Functions
127+
128+
(defn czech-author?
  "Returns 1 when `book-title` refers to a book by a Czech author, else 0.
  Titles starting with \"book\" come from the fully anonymized dataset and
  receive a random 0/1 flag instead, to keep the shared data realistic."
  [book-title]
  (let [czech-books #{:k30-hodin
                      :k365-anglickych-cool-fraz-a-vyrazov
                      :k365-anglickych-cool-frazi-a-vyrazu
                      :bulbem-zachranare
                      :hacknuta-cestina
                      :handmade-byznys
                      :hot
                      :hry-site-porno
                      :jak-na-site
                      :jak-sbalit-zenu-2.0
                      :konec-prokrastinace
                      :let-your-english-september
                      :myty-a-nadeje-digitalniho-sveta
                      :na-volne-noze
                      :napoleonuv-vojak
                      :nedelni-party-s-picassem
                      :restart-kreativity
                      :sport-je-bolest
                      :stat-se-investorem
                      :temne-pocatky-ceskych-dejin
                      :uc-jako-umelec
                      :velka-kniha-fuckupu
                      :zamilujte-se-do-anglictiny
                      :pretizeny
                      :od-chaosu-ke-smyslu
                      :very-hard-orechy
                      :heureka!}]
    (cond
      ;; anonymized titles get a random flag on purpose
      (str/starts-with? (str book-title) "book") (rand-int 2)
      (czech-books (keyword book-title)) 1
      :else 0)))
159+
160+
;; ### One-Hot Encoding Functions
161+
162+
163+
(defn onehot-encode-by-customers ;; FIXME needs refactor and simplification :)
  "One-hot encode dataset aggregated by customer.
  Each customer gets one row with 0/1 values for each book they bought.
  Used for market basket analysis, customer segmentation, etc.
  Expects `raw-ds` to have :zakaznik (customer) and :produkt-produkty (products) columns."
  [raw-ds]
  (let [;; First, aggregate all purchases by customer
        customer+orders (-> raw-ds
                            (ds/drop-missing :zakaznik)
                            (tc/drop-rows #(= "" (str/trim (:zakaznik %))))
                            (ds/drop-missing :produkt-produkty)
                            (tc/group-by [:zakaznik])
                            (tc/aggregate {:all-products #(str/join ", " (tc/column % :produkt-produkty))})
                            ;; NOTE(review): the aggregate above already names the column
                            ;; :all-products, so this rename of :summary looks like a no-op
                            ;; left over from an older tablecloth version — verify.
                            (tc/rename-columns {:summary :all-products}))
        ;; Get all unique books from all the lines
        all-titles (->> (tc/column customer+orders :all-products)
                        (mapcat parse-books-from-list)
                        distinct
                        sort)
        ;; For each customer create one aggregated row with all purchases in 0/1 format
        customers->rows (map
                         (fn [customer-row]
                           (let [customer-name (:zakaznik customer-row)
                                 books-bought-set (set (parse-books-from-list (:all-products customer-row)))
                                 one-hot-map (reduce (fn [acc book]
                                                       (assoc acc book (if (contains? books-bought-set book) 1 0)))
                                                     {}
                                                     all-titles)]
                             (merge {:zakaznik customer-name}
                                    one-hot-map)))
                         (tc/rows customer+orders :as-maps))
        ;; Create new dataset from one-hot data
        one-hot-ds (tc/dataset customers->rows)]
    ;; Return dataset with one-hot encoding
    one-hot-ds))
197+
198+
199+
;; ### Statistical Functions for Apriori Analysis
200+
201+
(defn calculate-support
  "Calculate support for a given itemset in a one-hot-encoded dataset.
  Support = (rows containing the full itemset) / (total rows).
  Returns 0.0 for an empty dataset."
  [dataset itemset]
  (let [row-has-itemset? (fn [row]
                           (every? #(not (zero? (get row %))) itemset))
        total (tc/row-count dataset)]
    (if (pos? total)
      (double (/ (tc/row-count (tc/select-rows dataset row-has-itemset?))
                 total))
      0.0)))
212+
213+
214+
^:kindly/hide-code
(defn calculate-adaptive-coefficient
  "Calculates adaptive coefficient for popularity bias correction.
  Chosen so that a rule whose items on both sides have maximal popularity
  is damped by roughly 50% of the median lift."
  [rules popularity-index]
  (let [median-lift (tcc/median (map :lift rules))
        max-popularity (apply max (vals popularity-index))]
    ;; half the median lift, spread across both rule sides at max popularity
    (/ (* 0.5 median-lift)
       (* 2 max-popularity))))
224+
225+
^:kindly/hide-code
(defn improved-adjusted-lift
  "Lift adjustment using adaptive popularity bias correction.
  Divides the rule's :lift by a penalty that grows with the average
  popularity of its antecedent and consequent items, so rules built from
  popular items get their lift reduced."
  [rule popularity-index adaptive-coefficient]
  (let [avg-popularity (fn [items]
                         (tcc/mean (vals (select-keys popularity-index items))))
        antecedent-pop (avg-popularity (:antecedent rule))
        consequent-pop (avg-popularity (seq (:consequent rule)))
        ;; dampening factor for popular items
        penalty (inc (* adaptive-coefficient
                        (+ antecedent-pop consequent-pop)))]
    (assoc rule :lift (double (/ (:lift rule) penalty)))))
241+
242+
243+
;; ### Visuals
244+
245+
(defn color-hex
  "Returns the base color #c1ab55 with an alpha channel derived from `support`.
  `support` is mapped linearly from [min-support, max-support] onto opacity
  [20, 255]; when min equals max the midpoint opacity is used. The opacity is
  clamped to that range and rendered as a two-digit lowercase hex suffix."
  [support min-support max-support]
  (let [lo 20
        hi 255
        raw (if (= min-support max-support)
              ;; degenerate range: fall back to the midpoint opacity
              (quot (+ lo hi) 2)
              (int (+ lo (* (- hi lo)
                            (/ (- support min-support)
                               (- max-support min-support))))))
        ;; keep the opacity inside [lo, hi]
        alpha (-> raw (max lo) (min hi))]
    (format "#c1ab55%02x" alpha)))
260+
261+
262+
;; ### Correlation functions
263+
264+
(defn corr-a-x-b
  "Creates a correlation matrix dataset with one column per book plus an
  added :book column naming each row's variable.
  - `ds` is a dataset; its :zakaznik column is dropped before correlating.
  Example:
  => _unnamed [2 3]:
  | :a | :b | :book |
  |------------:|------------:|-------|
  | 1.00000000 | -0.12121831 | :a |
  | -0.12121831 | 1.00000000 | :b |"
  [ds]
  (let [columns (tc/column-names ds)
        matrix (-> ds
                   (tc/drop-columns [:zakaznik])
                   tc/columns
                   stats/correlation-matrix)]
    (-> (tc/dataset (zipmap columns matrix))
        (tc/add-column :book columns))))
281+
282+
(defn corr-3-col
  "Creates a long-format correlation table with two columns of books:
  => _unnamed [4 3]:
  | :book-0 | :book-1 | :correlation |
  |---------|---------|-------------:|
  | :a | :a | 1.00000000 |
  | :a | :b | -0.12121831 |
  | :b | :a | -0.12121831 |
  | :b | :b | 1.00000000 |
  - `flatten` is used here to make a linear sequence of numbers which should match corresponding variable names.
  - Since we make pairs of names (`(for ... [a b])` creates a cartesian product) we need to separate these to individual columns; tc/separate-column does the trick, refer: https://scicloj.github.io/tablecloth/#separate"
  [ds]
  (let [names (tc/column-names ds)
        mat (flatten (stats/correlation-matrix (tc/columns ds)))]
    (-> (tc/dataset {:book (for [a names b names] [a b])
                     :correlation mat})
        (tc/separate-column :book)
        ;; NOTE(review): the rename below uses STRING keys ":book-0"/":book-1";
        ;; if tc/separate-column yields keyword column names, this rename is a
        ;; no-op — verify against the tablecloth version in use.
        (tc/rename-columns {":book-0" :titul-knihy
                            ":book-1" :book-1}))))
301+
302+
303+
;; ### Export helper functions from other namespaces for convenience

;; NOTE(review): these defs re-bind vars that are already defined in THIS
;; namespace to themselves, so they are no-ops; if re-exports from other
;; namespaces were intended, the right-hand sides should reference those
;; namespaces instead.
(def sanitize-str sanitize-str)
(def merge-csvs merge-csvs)
(def parse-books-from-list parse-books-from-list)
(def sanitize-column-name-str sanitize-column-name-str)
(def parse-csv-date parse-csv-date)

;; Load-time side effect: confirms the namespace was evaluated.
(println "Core helpers loaded.")
203 KB
Binary file not shown.
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
(ns data-analysis.book-sales-analysis.data-sources-v2
2+
(:require [tablecloth.api :as tc]
3+
[data-analysis.book-sales-analysis.core-helpers-v2 :as helpers]))
4+
5+
;; ## Central Data Loading
6+
;; All CSV datasets are loaded here with consistent names and simple tc/dataset calls
7+
8+
;; ### Main Orders Data (WooCommerce exports)
9+
10+
;; Disabled (reader-ignored): presentation-only dataset whose source CSV is
;; not shipped with the repo.
#_(def anonymized-presentation-ds
    "Anonymized customers full dataset of all orders - for presentation purposes
  ❗ NOT FOR SHARING"
    (tc/dataset
     (helpers/merge-csvs
      ["data/anonymized-customers-only-presentation-v2.csv"]
      {:header? true
       :separator ","
       :key-fn #(keyword (helpers/sanitize-str %))})))
20+
(def anonymized-shareable-ds
  "Fully anonymized slice of db - for sharing purposes
  ✅ SAFE TO SHARE"
  ;; Loaded eagerly at namespace load time from the repo-local CSV.
  ;; NOTE(review): merge-csvs already returns a dataset, so the outer
  ;; tc/dataset wrap is presumably redundant — confirm.
  (tc/dataset
   (helpers/merge-csvs
    ["src/data_analysis/book_sales_analysis/anonymized-all-sliced-sharing-V2.csv"]
    {:header? true
     :separator ","
     :key-fn #(keyword (helpers/sanitize-str %))})))
29+
30+
;; ### Book Metadata

;; ## Quick Access
;; Most commonly used datasets with short aliases

;; Short alias for the shareable orders dataset.
(def orders-share anonymized-shareable-ds)
#_(def orders-slides anonymized-presentation-ds)

;; Precomputed correlation matrix persisted in nippy format.
(def corr-matrix-precalculated
  (tc/dataset "src/data_analysis/book_sales_analysis/corr-matrix.nippy"))

;; NOTE(review): bare top-level evaluation below looks like a leftover
;; REPL/notebook form; it has no effect outside interactive/notebook use.
corr-matrix-precalculated

;; Load-time side effect: confirms the namespace was evaluated.
(prn "Data sources loaded.")
Binary file not shown.
131 KB
Loading

0 commit comments

Comments
 (0)