1+ (ns data-analysis.book-sales-analysis.core-helpers-v2
2+ (:import [java.text Normalizer Normalizer$Form]
3+ [java.io ByteArrayInputStream ObjectInputStream])
4+ (:require [tablecloth.api :as tc]
5+ [tech.v3.dataset :as ds]
6+ [tablecloth.column.api :as tcc]
7+ [clojure.string :as str]
8+ [java-time.api :as jt]
9+ [fastmath.stats :as stats]))
10+
;; ## Data Transformation Functions
;; Common data-processing functions used across multiple analysis files

;; ### Scicloj Helpers
15+
(defn merge-csvs
  "Reads every file in `file-list` as a dataset using `options`
  and concatenates them into a single dataset."
  [file-list options]
  (apply tc/concat
         (mapv (fn [file] (tc/dataset file options)) file-list)))
19+
;; ### Column and Content Sanitizers
21+
(defn sanitize-str
  "Sanitizes a string for use as a slug or identifier.
  Replaces underscores and spaces with hyphens, removes diacritics and parentheses,
  and converts to lower-case. Intended for general-purpose text like book titles.
  Returns nil/empty input unchanged."
  [s]
  (if (or (nil? s) (empty? s))
    s
    (let [hyphens (str/replace s #"_" "-")
          trimmed (str/trim hyphens)
          ;; NFD decomposition separates base letters from combining accents
          nfd-normalized (Normalizer/normalize trimmed Normalizer$Form/NFD)
          no-diacritics (str/replace nfd-normalized #"\p{InCombiningDiacriticalMarks}+" "")
          no-spaces (str/replace no-diacritics #" " "-")
          no-brackets (str/replace no-spaces #"\(|\)" "")
          lower-cased (str/lower-case no-brackets)]
      lower-cased)))
37+
38+
(defn sanitize-column-name-str
  "Sanitizes a string for use as a dataset column name.
  More aggressive than `sanitize-str`: also converts slashes to hyphens, collapses
  multiple hyphens, and removes the special substring '(YYYY-MM)'.
  Returns nil/empty input unchanged."
  [s]
  (if (or (nil? s) (empty? s))
    s
    (-> s
        (str/replace #"\(YYYY-MM\)" "")               ; special removal
        str/trim
        str/lower-case
        (str/replace #"_" "-")                        ; underscore to hyphen
        (str/replace #" " "-")                        ; space to hyphen
        (str/replace #"\/" "-")                       ; slash to hyphen
        (str/replace #"-{2,}" "-")                    ; collapse multiple hyphens
        (#(Normalizer/normalize % Normalizer$Form/NFD)) ; NFD-decompose accents
        (str/replace #"\p{InCombiningDiacriticalMarks}+" "") ; strip diacritics
        (str/replace #"\(|\)" ""))))                  ; drop parentheses
57+
(defn sanitize-category-str
  "Sanitizes a string representing categories.
  Similar to other sanitizers, but specifically handles comma-separated lists by
  removing the space after a comma (e.g. 'a, b' -> 'a,b').
  Returns nil/empty input unchanged."
  [s]
  (if (or (nil? s) (empty? s))
    s
    (-> s
        str/trim
        str/lower-case
        (str/replace #"\,\s" ",")                     ; drop space after comma
        (str/replace #"\s" "-")                       ; whitespace to hyphen
        (str/replace #"\/" "-")                       ; slash to hyphen
        (str/replace #"-{2,}" "-")                    ; collapse multiple hyphens
        (#(Normalizer/normalize % Normalizer$Form/NFD)) ; NFD-decompose accents
        (str/replace #"\p{InCombiningDiacriticalMarks}+" "") ; strip diacritics
        (str/replace #"\(|\)" ""))))                  ; drop parentheses
75+
(defn parse-book-name
  "Cleans a single book-name string into a slug form.
  Unlike `parse-books-from-list`, this takes just one string (not a list).
  Titles starting with a digit get a 'k' prefix so they form valid keywords
  (cf. :k30-hodin, :k365-... in the Czech-books set)."
  [s]
  (-> s
      (str/replace #"," "")
      (str/replace #"\+" "")
      str/trim
      sanitize-category-str
      (str/replace #"^3" "k3")
      (str/replace #"^5" "k5")))
84+
85+
(defn parse-csv-date
  "Parses a CSV date cell into a java-time local date.
  Two input shapes are handled (chosen by string length):
  - long form, e.g. 'dd.MM.yyyy' -> month at positions 3-5, year at 6-10
  - short form with a Czech month abbreviation, e.g. 'led.23'
  Both are rebuilt as 'yyyy-01-MM' and parsed with the 'yyyy-dd-MM' pattern
  (day fixed to 01). On any parse failure returns the string \"Chyba: <input>\".
  NOTE(review): the short form is assumed to look like 'led.23' with no space
  after the dot — a space would make Integer/parseInt fail; confirm with real data."
  [date-str]
  (let [month-names ["led" "úno" "bře" "dub" "kvě" "čvn" "čvc" "srp" "zář" "říj" "lis" "pro"]
        pad-month #(format "%02d" %)
        parse-full-date (fn [s]
                          (let [month (Integer/parseInt (subs s 3 5))]
                            (str (subs s 6 10) "-01-" (pad-month month))))
        parse-short-date (fn [s]
                           (let [[month-str year-str] (str/split s #"\.")
                                 ;; 1-based month from the abbreviation's position
                                 month (inc (.indexOf month-names month-str))
                                 year (+ 2000 (Integer/parseInt year-str))]
                             (str year "-01-" (pad-month month))))]
    (try
      (jt/local-date "yyyy-dd-MM"
                     (if (> (count date-str) 6)
                       (parse-full-date date-str)
                       (parse-short-date date-str)))
      (catch Exception _
        ;; deliberate best-effort: surface the bad cell instead of throwing
        (str "Chyba: " date-str)))))
104+
(defn parse-books-from-list
  "Parses book names from string `s`, separated by commas, into a vector of
  cleaned keywords. Strips quantity markers ('2× '), format suffixes
  ('(P+E)', 'audio', 'e-kniha', ...), and drops non-book items (vouchers,
  merch, bundles). Digit-leading titles get a 'k' prefix so they form valid
  keywords. Returns nil for blank/empty input."
  [s]
  (if (seq s)
    (->> (str/split s #",\s\d+")
         (map #(str/replace % #"\d*×\s" ""))          ; drop quantity markers
         (map #(str/replace % #"," ""))
         ;; drop format/packaging annotations
         (map #(str/replace % #"\(A\+E\)|\[|\]|komplet|a\+e|\s\(P\+E\+A\)|\s\(e\-kniha\)|\s\(P\+E\)|\s\(P\+A\)|\s\(E\+A\)|papír|papir|audio|e\-kniha|taška" ""))
         (map #(str/replace % #"\+" ""))
         (map str/trim)
         (map sanitize-str)
         (map #(str/replace % #"\-\-.+$" ""))         ; drop trailing '--...' remnants
         (map #(str/replace % #"\-+$" ""))            ; drop trailing hyphens
         (map #(str/replace % #"^3" "k3"))
         (map #(str/replace % #"^5" "k5"))
         ;; remove non-book items by substring match
         (remove (fn [item] (some (fn [substr] (str/includes? (name item) substr))
                                  ["balicek" "poukaz" "zapisnik" "limitovana-edice" "taska" "aktualizovane-vydani" "cd" "puvodni-vydani/neni-skladem"
                                   "merch"])))
         distinct
         (mapv keyword))
    nil))
125+
;; ### Metadata Enriching and Convenience Functions
127+
(defn czech-author?
  "Returns 1 when `book-title` (string or keyword) is a known Czech/Slovak title,
  else 0. For anonymized titles — anything whose string form starts with \"book\" —
  returns a random 0/1 so the fully anonymized dataset still carries a plausible
  Czech-author flag."
  [book-title]
  (let [czech-books #{:k30-hodin
                      :k365-anglickych-cool-fraz-a-vyrazov
                      :k365-anglickych-cool-frazi-a-vyrazu
                      :bulbem-zachranare
                      :hacknuta-cestina
                      :handmade-byznys
                      :hot
                      :hry-site-porno
                      :jak-na-site
                      :jak-sbalit-zenu-2.0
                      :konec-prokrastinace
                      :let-your-english-september
                      :myty-a-nadeje-digitalniho-sveta
                      :na-volne-noze
                      :napoleonuv-vojak
                      :nedelni-party-s-picassem
                      :restart-kreativity
                      :sport-je-bolest
                      :stat-se-investorem
                      :temne-pocatky-ceskych-dejin
                      :uc-jako-umelec
                      :velka-kniha-fuckupu
                      :zamilujte-se-do-anglictiny
                      :pretizeny
                      :od-chaosu-ke-smyslu
                      :very-hard-orechy
                      :heureka!}]
    ;; anonymized datasets use opaque "book..." names -> assign the flag randomly
    (if (str/starts-with? (str book-title) "book")
      (rand-int 2)
      (if (contains? czech-books (keyword book-title)) 1 0))))
159+
;; ### One-Hot Encoding Functions
161+
162+
(defn onehot-encode-by-customers ;; FIXME needs refactor and simplification :)
  "One-hot encodes `raw-ds` aggregated by customer.
  Each customer gets one row with 0/1 values for each book they bought.
  Used for market basket analysis, customer segmentation, etc.
  Expects :zakaznik (customer) and :produkt-produkty (products) columns."
  [raw-ds]
  (let [;; First, aggregate all purchases by customer
        customer+orders (-> raw-ds
                            (ds/drop-missing :zakaznik)
                            ;; also drop customers whose name is blank after trimming
                            (tc/drop-rows #(= "" (str/trim (:zakaznik %))))
                            (ds/drop-missing :produkt-produkty)
                            (tc/group-by [:zakaznik])
                            (tc/aggregate {:all-products #(str/join ", " (tc/column % :produkt-produkty))})
                            ;; normalize the aggregate's generated column name
                            (tc/rename-columns {:summary :all-products}))
        ;; Get all unique books across every customer's aggregated list
        all-titles (->> (tc/column customer+orders :all-products)
                        (mapcat parse-books-from-list)
                        distinct
                        sort)
        ;; For each customer create one aggregated row with all purchases in 0/1 format
        customers->rows (map
                         (fn [customer-row]
                           (let [customer-name (:zakaznik customer-row)
                                 books-bought-set (set (parse-books-from-list (:all-products customer-row)))
                                 one-hot-map (reduce (fn [acc book]
                                                       (assoc acc book (if (contains? books-bought-set book) 1 0)))
                                                     {}
                                                     all-titles)]
                             (merge {:zakaznik customer-name}
                                    one-hot-map)))
                         (tc/rows customer+orders :as-maps))
        ;; Create new dataset from one-hot data
        one-hot-ds (tc/dataset customers->rows)]
    ;; Return dataset with one-hot encoding
    one-hot-ds))
197+
198+
;; ### Statistical Functions for Apriori Analysis
200+
(defn calculate-support
  "Calculates support for `itemset` in a one-hot-encoded `dataset`.
  Support = (rows containing itemset) / (total rows).
  A row contains the itemset when every item's column is non-zero.
  A missing column now counts as 0 (item absent) instead of throwing a NPE
  from (zero? nil). Returns 0.0 for an empty dataset."
  [dataset itemset]
  (let [total-transactions (tc/row-count dataset)
        contains-itemset? (fn [row]
                            ;; default 0 so an unknown item is treated as absent
                            (every? #(not (zero? (get row % 0))) itemset))
        transactions-with-itemset (-> dataset
                                      (tc/select-rows contains-itemset?)
                                      tc/row-count)]
    (if (zero? total-transactions)
      0.0
      (double (/ transactions-with-itemset total-transactions)))))
212+
213+
^:kindly/hide-code
(defn calculate-adaptive-coefficient
  "Calculates an adaptive coefficient for popularity-bias correction.
  Sized so that a rule with maximally popular items on both sides is
  dampened by roughly 50% of the median lift across `rules`."
  [rules popularity-index]
  (let [lifts (map :lift rules)
        peak-popularity (reduce max (vals popularity-index))]
    (/ (* 0.5 (tcc/median lifts))
       (* 2 peak-popularity))))
224+
^:kindly/hide-code
(defn improved-adjusted-lift
  "Adjusts a rule's :lift with adaptive popularity-bias correction:
  combinations of popular items get a proportionally reduced lift."
  [rule popularity-index adaptive-coefficient]
  (let [;; mean popularity of the given items (looked up in the index)
        avg-popularity (fn [items]
                         (tcc/mean (vals (select-keys popularity-index items))))
        combined (+ (avg-popularity (:antecedent rule))
                    (avg-popularity (seq (:consequent rule))))
        ;; dampening factor grows with combined popularity
        penalty (inc (* adaptive-coefficient combined))]
    (update rule :lift #(double (/ % penalty)))))
241+
242+
;; ### Visuals
244+
(defn color-hex
  "Maps `support` from [min-support, max-support] to a hex color with
  proportional alpha, on the fixed base color #c1ab55. Alpha ranges from
  0x14 (20) to 0xff (255); when min equals max the midpoint alpha is used.
  Out-of-range supports are clamped to the alpha bounds."
  [support min-support max-support]
  (let [min-opacity 20
        max-opacity 255
        ;; Map support linearly onto [min-opacity, max-opacity]
        opacity (if (= min-support max-support)
                  ;; degenerate range: use the midpoint alpha
                  (int (/ (+ min-opacity max-opacity) 2))
                  (int (+ min-opacity
                          (* (- max-opacity min-opacity)
                             (/ (- support min-support)
                                (- max-support min-support))))))
        ;; Ensure opacity stays within bounds
        clamped-opacity (min max-opacity (max min-opacity opacity))
        hex-opacity (format "%02x" clamped-opacity)]
    (str "#c1ab55" hex-opacity)))
260+
261+
;; ### Correlation functions
263+
(defn corr-a-x-b
  "Creates a correlation-matrix dataset with book columns plus an added :book column.
  - `ds` is the input dataset; a :zakaznik column, if present, is excluded.
  Example:
  => _unnamed [2 3]:
  | :a | :b | :book |
  |------------:|------------:|-------|
  | 1.00000000 | -0.12121831 | :a |
  | -0.12121831 | 1.00000000 | :b |"
  [ds]
  (let [clean-ds (tc/drop-columns ds [:zakaznik])
        ;; BUGFIX: names must come from clean-ds — the original took them from ds,
        ;; so with a :zakaznik column present the zipmap misaligned column names
        ;; against the correlation-matrix rows computed on clean-ds.
        columns (tc/column-names clean-ds)]
    (-> (zipmap columns (stats/correlation-matrix (tc/columns clean-ds)))
        tc/dataset
        (tc/add-column :book columns))))
281+
(defn corr-3-col
  "Creates a long-format correlation table with two book columns:
  => _unnamed [4 3]:
  | :titul-knihy | :book-1 | :correlation |
  |--------------|---------|-------------:|
  | :a | :a | 1.00000000 |
  | :a | :b | -0.12121831 |
  | :b | :a | -0.12121831 |
  | :b | :b | 1.00000000 |
  - `flatten` produces a linear sequence of coefficients matching the
    cartesian product of names built by `(for [a names b names] [a b])`.
  - `tc/separate-column` splits the [a b] pairs into individual columns,
    see https://scicloj.github.io/tablecloth/#separate"
  [ds]
  (let [names (tc/column-names ds)
        mat (flatten (stats/correlation-matrix (tc/columns ds)))]
    (-> (tc/dataset {:book (for [a names b names] [a b])
                     :correlation mat})
        (tc/separate-column :book)
        ;; NOTE(review): separate-column is assumed to emit string-named
        ;; columns ":book-0"/":book-1" here — confirm against the tablecloth
        ;; version in use; if it emits keywords, the rename keys must be
        ;; :book-0/:book-1 instead.
        (tc/rename-columns {":book-0" :titul-knihy
                            ":book-1" :book-1}))))
301+
302+
;; ### Convenience aliases (NOTE: these alias vars defined in this same namespace, kept for callers that expect them here)
304+
;; NOTE(review): each def below re-binds a var to its own current value in this
;; same namespace — effectively a self-referential no-op. Kept unchanged because
;; external callers may resolve these vars by name; removing them would be an
;; interface change.
(def sanitize-str sanitize-str )
(def merge-csvs merge-csvs )
(def parse-books-from-list parse-books-from-list )
(def sanitize-column-name-str sanitize-column-name-str )
(def parse-csv-date parse-csv-date )
310+
311+ (println " Core helpers loaded." )