elvisnava
diff --git a/‎Baselines/choetkiertikul.py‎
Lines changed: 163 additions & 272 deletions b/‎Baselines/choetkiertikul.py‎
Lines changed: 163 additions & 272 deletions
diff --git a/‎Baselines/choetkiertikul_helpers.py‎
Lines changed: 3 additions & 3 deletions b/‎Baselines/choetkiertikul_helpers.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎Baselines/choetkiertikul_refactored.py‎
Lines changed: 0 additions & 249 deletions b/‎Baselines/choetkiertikul_refactored.py‎
Lines changed: 0 additions & 249 deletions
diff --git a/‎Baselines/data.py‎
Lines changed: 8 additions & 0 deletions b/‎Baselines/data.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎Baselines/gp_utils.py‎
Lines changed: 1 addition & 1 deletion b/‎Baselines/gp_utils.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md‎
Lines changed: 20 additions & 4 deletions b/‎README.md‎
Lines changed: 20 additions & 4 deletions
@@ -7,7 +7,7 @@
 
 
 def get_user_data(db_access):
-    date_now =  db_access.end
+    date_now = db_access.end
 
     date_string = str(date_now)
 
@@ -135,7 +135,7 @@ def make_pairs(question_stream, # dataframe with all questions (including testin
 
     # get actual answerer
 
-def overview_score(y_true, y_hat, group):
+def overview_score(y_true, y_hat, group, label=None):
     assert(y_hat.dtype==np.float)
     y_hat_bin = y_hat >=0.5
 
@@ -152,7 +152,7 @@ def overview_score(y_true, y_hat, group):
     mrr_score, mrr_ranks = utils.multi_mrr(out_probs=y_hat, grouped_queries=group, ground_truth=y_true)
     mrr_time = time.time() - t0
 
-    all_info = dict(accuracy=acc, precission = prec, recall = rec, fscore = fscore, prediction_values=hist, mrr_score = mrr_score, mrr_time=mrr_time)
+    all_info = dict(accuracy=acc, precission = prec, recall = rec, fscore = fscore, prediction_values=hist, mrr_score = mrr_score, mrr_time=mrr_time, label=label)
 
     return all_info, mrr_ranks
 
 
@@ -343,6 +343,14 @@ class GetAnswerersStrategy:
 
     # answerer users for ids (sets)
     def __init__(self, votes_threshold=None, _db_access = None, verbose=0, with_score=False, accepted_only=True):
+        """
+
+        :param votes_threshold:
+        :param _db_access:
+        :param verbose:
+        :param with_score:
+        :param accepted_only: if False, all answers are returned; if True, only accepted answers or answers with a score above the specified threshold are returned
+        """
         self.votes_threshold = votes_threshold
         self.with_score = with_score
         self.accepted_only = accepted_only
 
@@ -84,7 +84,7 @@ def pretrain_gp_ucp(feature_collection, all_events_pretraining_dataframe, hour_t
         assert(not np.isnan(event.answerer_user_id))
         assert(not np.isnan(event.asker_user_id))
 
-        if i%100 ==0 :
+        if i%100 == 0 :
             avg_candidates = np.mean(n_candidates_collector)
             print("Preptraining at {}| on average {} candidates in the last {} suggested_question_events".format(event.answer_date, avg_candidates, len(n_candidates_collector)))
             n_candidates_collector = list()
 
@@ -1,7 +1,23 @@
 # DSLab-StackOverflow
 Repository for our 2019 Data Science Lab project at ETH Zurich, titled "Moving towards Dynamic Recommendation in CQA"
 
-Commands for getting the final GP database data from Clemens' polybox:    
-```wget -O all_events.pickle "https://polybox.ethz.ch/index.php/s/Eq9jOCZSsX3R7Xw/download" ```    
-```wget -O cached_data.pickle "https://polybox.ethz.ch/index.php/s/PMxlsVyqSXw8B3w/download" ```   
-```wget -O pretraining.pickle "https://polybox.ethz.ch/index.php/s/6Sl6iYPsdlRs9Kx/download" ```
+# Installation
+
+Install all requirements from the `requirements.txt`.   
+Follow the tutorial `dump-to-postgres-tutorial.md` to convert the database dump from Stack Exchange to a PostgreSQL database.   
+
+
+# Baselines
+### Choetkiertikul
+
+Make sure that the PostgreSQL server is running locally. 
+
+Run `Baselines/choetkiertikul.py` to generate the file containing all user-question pairs used for training and testing (this takes a couple of hours). 
+Run the notebook `data-exploration/choetkiertikul.ipynb` to train the random forest and generate plots. 
+
+
+# Novel Approach
+## GP-TOP-K
+Run `gp_user2Lquestion.py` to run GP-TOP-K. A number of command-line arguments are available. 
+
+