Skip to content

Commit 62c9196

Browse files
committed
updated readme
1 parent 20d7b2e commit 62c9196

14 files changed

Lines changed: 2584 additions & 2853 deletions

Baselines/choetkiertikul.py

Lines changed: 163 additions & 272 deletions
Large diffs are not rendered by default.

Baselines/choetkiertikul_helpers.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88

99
def get_user_data(db_access):
10-
date_now = db_access.end
10+
date_now = db_access.end
1111

1212
date_string = str(date_now)
1313

@@ -135,7 +135,7 @@ def make_pairs(question_stream, # dataframe with all questions (including testin
135135

136136
# get actual answerer
137137

138-
def overview_score(y_true, y_hat, group):
138+
def overview_score(y_true, y_hat, group, label=None):
139139
assert(y_hat.dtype==np.float)
140140
y_hat_bin = y_hat >=0.5
141141

@@ -152,7 +152,7 @@ def overview_score(y_true, y_hat, group):
152152
mrr_score, mrr_ranks = utils.multi_mrr(out_probs=y_hat, grouped_queries=group, ground_truth=y_true)
153153
mrr_time = time.time() - t0
154154

155-
all_info = dict(accuracy=acc, precission = prec, recall = rec, fscore = fscore, prediction_values=hist, mrr_score = mrr_score, mrr_time=mrr_time)
155+
all_info = dict(accuracy=acc, precission = prec, recall = rec, fscore = fscore, prediction_values=hist, mrr_score = mrr_score, mrr_time=mrr_time, label=label)
156156

157157
return all_info, mrr_ranks
158158

Baselines/choetkiertikul_refactored.py

Lines changed: 0 additions & 249 deletions
This file was deleted.

Baselines/data.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,14 @@ class GetAnswerersStrategy:
343343

344344
# answerer users for ids (sets)
345345
def __init__(self, votes_threshold=None, _db_access = None, verbose=0, with_score=False, accepted_only=True):
346+
"""
347+
348+
:param votes_threshold:
349+
:param _db_access:
350+
:param verbose:
351+
:param with_score:
352+
:param accepted_only: if False, all answers are returned; if True, only accepted answers or answers with a score above the specified threshold are returned
353+
"""
346354
self.votes_threshold = votes_threshold
347355
self.with_score = with_score
348356
self.accepted_only = accepted_only

Baselines/gp_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def pretrain_gp_ucp(feature_collection, all_events_pretraining_dataframe, hour_t
8484
assert(not np.isnan(event.answerer_user_id))
8585
assert(not np.isnan(event.asker_user_id))
8686

87-
if i%100 ==0 :
87+
if i%100 == 0 :
8888
avg_candidates = np.mean(n_candidates_collector)
8989
print("Preptraining at {}| on average {} candidates in the last {} suggested_question_events".format(event.answer_date, avg_candidates, len(n_candidates_collector)))
9090
n_candidates_collector = list()

README.md

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,23 @@
11
# DSLab-StackOverflow
22
Repository for our 2019 Data Science Lab project at ETH Zurich, titled "Moving towards Dynamic Recommendation in CQA"
33

4-
Commands for getting the final GP database data from Clemens' polybox:
5-
```wget -O all_events.pickle "https://polybox.ethz.ch/index.php/s/Eq9jOCZSsX3R7Xw/download" ```
6-
```wget -O cached_data.pickle "https://polybox.ethz.ch/index.php/s/PMxlsVyqSXw8B3w/download" ```
7-
```wget -O pretraining.pickle "https://polybox.ethz.ch/index.php/s/6Sl6iYPsdlRs9Kx/download" ```
4+
# Installation
5+
6+
Install all requirements from the `requirements.txt`.
7+
Follow the tutorial `dump-to-postgres-tutorial.md` to convert the database dump from Stack Exchange to a PostgreSQL database.
8+
9+
10+
# Baselines
11+
### Choetkiertikul
12+
13+
Make sure that the PostgreSQL server is running locally.
14+
15+
Run `Baselines/choetkiertikul.py` to generate the file containing all user-question pairs used for training and testing (this takes a couple of hours).
16+
Run the notebook `data-exploration/choetkiertikul.ipynb` to train the random forest and generate plots.
17+
18+
19+
# Novel Approach
20+
## GP-TOP-K
21+
Run `gp_user2Lquestion.py` to run GP-TOP-K. A number of command line arguments are available.
22+
23+

0 commit comments

Comments
 (0)