|
15 | 15 |
|
16 | 16 | import gp_features |
17 | 17 | from gp_utils import * |
18 | | - |
19 | | -cached_data = data.DataHandleCached() |
| 18 | +from collections import defaultdict |
| 19 | + |
| 20 | +def sample_open_questions(open_questions, random_from_cdf, gt_question): |
| 21 | +    """
| 22 | +    Sample open questions such that their age distribution matches the empirical histogram.
| 23 | +    param: open_questions: all questions that are open at this point in time
| 24 | +    param: random_from_cdf: question ages drawn from the empirical question-age distribution
| 25 | +    param: gt_question: positional index of the ground-truth question, which must end up in the sample
| 26 | +    returns: the sampled open questions, and a flag whether the ground truth had to be added manually
| 27 | +    """
| 28 | +    age_vals = open_questions["question_age"].values
| 29 | +    uni, counts = np.unique(random_from_cdf, return_counts=True)
| 30 | +    val_before = 0
| 31 | +    final_inds = []
| 32 | +    for r in range(len(uni)):
| 33 | +        val = uni[r]
| | +        # open questions whose age falls between the previous and the current sampled age
| 34 | +        val_set = set(np.where(age_vals < val)[0]).intersection(np.where(age_vals > val_before)[0])
| 35 | +        val_before = val
| 36 | +        if len(val_set) > counts[r]:
| 37 | +            subset = list(np.random.choice(list(val_set), counts[r], replace=False))
| 38 | +        else:
| 39 | +            subset = list(val_set)
| 40 | +            if r < len(uni) - 1:
| 41 | +                counts[r+1] += counts[r] - len(val_set)  # carry the shortfall over to the next age bin
| 42 | +            else:  # last bin reached: fill up with questions older than the largest sampled age
| 43 | +                nr_missing = counts[r] - len(val_set)
| 44 | +                val_set = np.where(age_vals > val)[0]
| 45 | +                if len(val_set) > nr_missing:
| 46 | +                    rand_of_leftover = np.random.choice(val_set, nr_missing, replace=False)
| 47 | +                    subset.extend(rand_of_leftover)
| 48 | +                else:
| 49 | +                    subset.extend(val_set)
| 50 | +        final_inds.extend(subset)
| | +    # make sure the ground-truth question is always part of the sample
| 51 | +    manually = 0
| 52 | +    if gt_question not in final_inds:
| 53 | +        final_inds.append(gt_question)
| 54 | +        manually = 1
| 55 | +        print("manually added")
| 56 | +    final_inds_index = open_questions.index[final_inds]
| 57 | +    final_inds_index = sorted(final_inds_index)
| 58 | +    open_questions = open_questions.loc[final_inds_index]
| 59 | +    return open_questions, manually
| 60 | + |
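| | +# A minimal usage sketch (hypothetical values): given a DataFrame of open questions
| | +# with a "question_age" column (in days) and ages drawn from the empirical age
| | +# distribution, this keeps a subset whose ages follow that distribution and always
| | +# contains the ground-truth question, here assumed to sit at positional index 3:
| | +#   sampled, manually = sample_open_questions(open_questions, random_from_cdf, 3)
| | +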
| 61 | +# PARAMETERS:
| 62 | +redo_database_dumps = False |
| 63 | +redo_histogram = False |
| 64 | +# parameters for suggested questions |
| 65 | +hour_threshold_suggested_answer = 24 |
| 66 | +only_open_questions_suggestable = True |
| 67 | +filter_nan_asker_id = True |
| 68 | +# output directory (must exist) |
| 69 | +save_dir = "burel_data" |
| 70 | +# number of negative samples per positive; if more questions are open, they are downsampled
| 71 | +NR_NEG = 100 |
| 72 | + |
| 73 | +# paths for cached data |
| 74 | +fp = "../cache" |
| 75 | +all_events_file = os.path.join(fp, "gp/all_events.pickle") |
| 76 | +cached_data_file = os.path.join(fp, "gp/cached_data.pickle") |
| 77 | + |
| 78 | +if redo_database_dumps:
| 79 | +    all_events_dataframe = data_utils.all_answer_events_dataframe(start_time=None, end_time=None, time_delta_scores_after_post=time_delta_scores_after_posts, filter_empty_asker=filter_nan_asker, filter_empty_target_user=filter_nan_answerer)
| 80 | +    all_events_dataframe.to_pickle(all_events_file)
| 81 | +
| 82 | +    cached_data = data.DataHandleCached()
| 83 | +    with open(cached_data_file, "wb") as f:
| 84 | +        pickle.dump(cached_data, f)
| 85 | +else:
| 86 | +    all_events_dataframe = pd.read_pickle(all_events_file)
| 87 | +
| 88 | +    with open(cached_data_file, "rb") as f:
| 89 | +        cached_data = pickle.load(f)
| 90 | + |
| 91 | +# define data and feature handles |
20 | 92 | data_handle = data.Data() |
21 | 93 |
|
22 | 94 | feature_collection = gp_features.GP_Feature_Collection( |
23 | 95 |     gp_features.GP_Features_affinity(),
24 | | -    gp_features.GP_Features_TTM(),
25 | 96 |     gp_features.GP_Features_Question(),
26 | 97 |     gp_features.GP_Features_user())
27 | 98 |
|
28 | | -# parameters for suggested questions |
29 | | -hour_threshold_suggested_answer = 24 |
30 | | -only_open_questions_suggestable = False |
31 | | -filter_nan_asker_id = True |
32 | | - |
33 | | -save_dir = "baseline_data" |
34 | 99 |
|
35 | | -start_time = None # data_utils.make_datetime("01.01.2012 00:01") |
36 | | -end_time = data_utils.make_datetime("01.01.2016 00:01") # data_utils.make_datetime("01.03.2012 00:01") |
| 100 | +# start and end of data |
| 101 | +start_time = data_utils.make_datetime("01.01.2012 00:01") |
| 102 | +end_time = data_utils.make_datetime("01.01.2017 00:01") # data_utils.make_datetime("01.03.2012 00:01") |
37 | 103 |
|
38 | 104 | all_feates_collector = list() |
39 | | - |
40 | 105 | n_candidates_collector = list() |
41 | 106 |
|
42 | | -save_every = 10000 |
| 107 | +save_every = 300 |
43 | 108 | q_a_pair_counter = 1 |
44 | 109 |
|
| 110 | +## Approximate question-age distribution
| 111 | +if redo_histogram:
| 112 | +    questionage_table = data_handle.query("SELECT a.id, (answercreationdate-CreationDate) as questionage FROM (SELECT parentid as Id, creationdate as answercreationdate FROM Posts WHERE PostTypeId=2) a LEFT JOIN Posts b ON a.Id=b.Id;")
| 113 | +    questionage_table["questionage"] = questionage_table["questionage"].dt.days + (questionage_table["questionage"].dt.seconds)/(24*60*60)
| 114 | +    age_vals = questionage_table["questionage"].values
| 115 | +    age_vals = age_vals[age_vals > 0]
| 116 | +    age_vals = age_vals[age_vals < 100]  # restrict to ages between 0 and 100 days
| 117 | +    hist, bins = np.histogram(age_vals, bins=500)
| 118 | +    bin_midpoints = bins[1:]  # right bin edges; the true midpoints would be bins[:-1] + np.diff(bins)/2
| 119 | +    cdf = np.cumsum(hist)
| 120 | +    cdf = cdf / cdf[-1]
| 121 | +    values = np.random.rand(NR_NEG)  # one uniform draw per negative sample (NR_NEG == 100)
| 122 | +    value_bins = np.searchsorted(cdf, values)
| 123 | +    random_from_cdf = bin_midpoints[value_bins]
| 124 | +    with open("random_from_cdf.pickle", "wb") as outfile:
| 125 | +        pickle.dump(random_from_cdf, outfile)
| 126 | +else:
| 127 | +    with open("random_from_cdf.pickle", "rb") as infile:
| 128 | +        random_from_cdf = pickle.load(infile)
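| | +
| | +# The block above performs inverse transform sampling: np.searchsorted maps each
| | +# uniform draw through the empirical CDF, so random_from_cdf follows the observed
| | +# question-age distribution. A minimal sketch with a hypothetical 3-bin histogram:
| | +#   hist = np.array([6, 3, 1])
| | +#   cdf = np.cumsum(hist) / np.sum(hist)   # -> [0.6, 0.9, 1.0]
| | +#   np.searchsorted(cdf, 0.75)             # -> 1, i.e. an age from the second bin
| | +#   np.searchsorted(cdf, 0.20)             # -> 0, i.e. an age from the first bin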
| 129 | + |
| 130 | + |
| 131 | +user_dic = defaultdict(int)  # number of answers each user has given so far
| 132 | + |
| 133 | + |
| 134 | +# START ITERATING THROUGH DATA |
| 135 | +prev_answertime = start_time  # time of the last answer event that was added to the data set
45 | 136 | for i, event in enumerate(data_utils.all_answer_events_iterator(timedelta(days=2), start_time=start_time, end_time=end_time)): |
46 | 137 |     if np.isnan(event.answerer_user_id) or np.isnan(event.asker_user_id):
47 | 138 |         continue
|
50 | 141 |         avg_candidates = np.mean(n_candidates_collector)
51 | 142 |         print("Pretraining at {} | on average {} candidates in the last {} suggested_question_events".format(event.answer_date, avg_candidates, len(n_candidates_collector)))
52 | 143 |         n_candidates_collector = list()
53 | | - |
54 | | - if is_user_answers_suggested_event(event, hour_threshold_suggested_answer): |
| 144 | + |
| 145 | +    # only add to the data if the user has answered at least five questions so far
| | +    # and this answer comes more than 12 hours after the previously collected one
| 146 | +    if user_dic[event.answerer_user_id] >= 5 and event.answer_date > prev_answertime + timedelta(hours=12):
| 147 | +        open_questions = get_suggestable_questions(event.answer_date, cached_data, only_open_questions_suggestable, hour_threshold_suggested_answer, filter_nan_asker_id)
| 148 | +        # add question age in fractional days
| 149 | +        question_dates = [pd.Timestamp(x) for x in open_questions["question_date"].values]
| 150 | +        open_questions["question_age"] = [event.answer_date - question_event_time for question_event_time in question_dates]
| 151 | +        open_questions["question_age"] = (open_questions["question_age"].dt.days + (open_questions["question_age"].dt.seconds)/(24*60*60))
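| | +        # e.g. a question asked 36 hours before the answer gets question_age 1.5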
55 | 152 |
|
56 | | - suggestable_questions = get_suggestable_questions(event.answer_date, cached_data, only_open_questions_suggestable, hour_threshold_suggested_answer, filter_nan_asker_id) |
57 | | - if len(suggestable_questions) ==0: |
58 | | - # warnings.warn("For answer id {} (to question {}) there was not a single suggestable question".format(event.answer_id, event.question_id)) |
| 153 | +        gt_ind = np.where(open_questions.question_id == event.question_id)[0]
| 154 | +        if len(open_questions) == 0 or len(gt_ind) == 0:
| 155 | +            print("Warning: for answer id {} (to question {}) the question was already answered or there was not a single suggestable question".format(event.answer_id, event.question_id))
59 | 156 |             continue
60 | | - |
| | +        # downsample the candidates so that their ages follow the empirical age distribution
| 157 | +        if len(open_questions) <= NR_NEG:
| 158 | +            suggestable_questions = open_questions
| 159 | +            manually = 0
| 160 | +        else:
| 161 | +            print("sampling")
| 162 | +            suggestable_questions, manually = sample_open_questions(open_questions, random_from_cdf, gt_ind[0])
| 163 | +
| 164 | +        assert(np.any(suggestable_questions.question_id == event.question_id))
| 165 | + |
61 | 166 |         n_candidates_collector.append(len(suggestable_questions))
62 | 167 |
|
63 | | -        # only append once a certain amount of time has passed?
| 168 | +        # append to feature and label list
64 | 169 |         feats = feature_collection.compute_features(event.answerer_user_id, suggestable_questions, event.answer_date)
65 | 170 |         label = suggestable_questions.question_id.values == event.question_id
66 | 171 |
|
67 | 172 |         # add some more information
68 | 173 |         feats["question_id"] = suggestable_questions.question_id.values.tolist()  # remember question ids
69 | 174 |         feats["decision_time"] = q_a_pair_counter  # id of this decision event; needed to group questions when computing the MRR
70 | 175 |         feats["label"] = label.astype(int)
| 176 | +        feats["manually_added"] = manually
71 | 177 |         feats["answer_date"] = pd.Series([event.answer_date for _ in range(len(feats))])
72 | 178 |
|
73 | 179 |         all_feates_collector.append(feats)
74 | 180 |
|
75 | 181 |         assert(np.sum(np.asarray(label).astype(int)) == 1)
76 | | - |
| 182 | +        q_a_pair_counter += 1
| 183 | +        prev_answertime = event.answer_date
| 184 | +
| 185 | +        # save in between and clear the collector in order to back up
| 186 | +        if q_a_pair_counter % save_every == 0:
| 187 | +            save_name = "feature_data_" + str(q_a_pair_counter // save_every) + ".csv"
| 188 | +            features_table = pd.concat(all_feates_collector, axis=0)
| 189 | +            features_table.to_csv(os.path.join(save_dir, save_name), index=False)
| 190 | +            print("Successfully saved data in between", save_name)
| 191 | +            del features_table
| 192 | +            del all_feates_collector
| 193 | +            all_feates_collector = list()
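| | +            # e.g. with save_every = 300, the batch completed at q_a_pair_counter == 600 is saved as feature_data_2.csv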
77 | 194 |
|
78 | 195 |     feature_collection.update_pos_event(event)  # update features in any case
79 | | - |
80 | | - # save inbetween and clear variables in order to backup |
81 | | - if (q_a_pair_counter+1) % save_every==0: |
82 | | - save_name = "feature_data_"+str((q_a_pair_counter+1)//save_every)+".csv" |
83 | | - features_table = pd.concat(all_feates_collector, axis=0) |
84 | | - features_table.to_csv(os.path.join(save_dir, save_name), index=False) |
85 | | - print("Successfully saved data inbetween", save_name) |
86 | | - del features_table |
87 | | - del all_feates_collector |
88 | | - all_feates_collector = list() |
| 196 | +    user_dic[event.answerer_user_id] += 1  # count every answer of this user, whether or not the event was added
89 | 197 |
|
90 | 198 |
|
91 | | - q_a_pair_counter+=1 |
| 199 | + |
92 | 200 |
|
93 | | -if (q_a_pair_counter+1) % save_every != 0: # last batch hasn't been saved |
| 201 | +if q_a_pair_counter % save_every != 0:  # last batch hasn't been saved yet
94 | 202 |     save_name = "feature_data_" + str(q_a_pair_counter // save_every + 1) + ".csv"
95 | 203 |     features_table = pd.concat(all_feates_collector, axis=0)
96 | 204 |     features_table.to_csv(os.path.join(save_dir, save_name), index=False)
|