
Commit 20d7b2e

burel baseline code added
1 parent e200638 commit 20d7b2e

3 files changed

Lines changed: 249 additions & 54 deletions

File tree

Baselines/baseline_burel.py

Lines changed: 141 additions & 33 deletions
@@ -15,33 +15,124 @@
 
 import gp_features
 from gp_utils import *
-
-cached_data = data.DataHandleCached()
+from collections import defaultdict
+
+
+def sample_open_questions(open_questions, random_from_cdf, gt_question):
+    """
+    Sample open questions according to the question-age histogram.
+    param open_questions: all questions that are open at this point in time
+    param random_from_cdf: samples drawn from the question-age histogram distribution
+    param gt_question: positional index of the ground-truth question in open_questions
+    returns: the open questions that were sampled, and a flag whether the ground truth had to be added manually
+    """
+    age_vals = open_questions["question_age"].values
+    uni, counts = np.unique(random_from_cdf, return_counts=True)
+    val_before = 0
+    final_inds = []
+    for r in range(len(uni)):
+        val = uni[r]
+        # positional indices of the open questions whose age falls into the current bin
+        val_set = set(np.where(age_vals < val)[0]).intersection(np.where(age_vals > val_before)[0])
+        val_before = val
+        if len(val_set) > counts[r]:
+            subset = list(np.random.choice(list(val_set), counts[r], replace=False))
+        else:
+            subset = list(val_set)
+        if r < len(uni) - 1:
+            # carry any deficit (or surplus) over to the next bin
+            counts[r + 1] += counts[r] - len(val_set)
+        else:  # last bin reached: fill up with questions older than the last bin edge
+            nr_missing = counts[r] - len(val_set)
+            val_set = np.where(age_vals > val)[0]
+            if len(val_set) > nr_missing:
+                rand_of_leftover = np.random.choice(val_set, nr_missing, replace=False)
+                subset.extend(rand_of_leftover)
+            else:
+                subset.extend(val_set)
+        final_inds.extend(subset)
+    manually = 0
+    if gt_question not in final_inds:
+        final_inds.append(gt_question)
+        manually = 1
+        print("manually added")
+    final_inds_index = open_questions.index[final_inds]
+    final_inds_index = sorted(final_inds_index)
+    open_questions = open_questions.loc[final_inds_index]
+    return open_questions, manually
+
+
+# PARAMETERS:
+redo_database_dumps = False
+redo_histogram = False
+# parameters for suggested questions
+hour_threshold_suggested_answer = 24
+only_open_questions_suggestable = True
+filter_nan_asker_id = True
+# output directory (must exist)
+save_dir = "burel_data"
+# number of negative samples per positive
+NR_NEG = 100
+
+# paths for cached data
+fp = "../cache"
+all_events_file = os.path.join(fp, "gp/all_events.pickle")
+cached_data_file = os.path.join(fp, "gp/cached_data.pickle")
+
+if redo_database_dumps:
+    all_events_dataframe = data_utils.all_answer_events_dataframe(start_time=None, end_time=None, time_delta_scores_after_post=time_delta_scores_after_posts, filter_empty_asker=filter_nan_asker, filter_empty_target_user=filter_nan_answerer)
+    all_events_dataframe.to_pickle(all_events_file)
+
+    cached_data = data.DataHandleCached()
+    with open(cached_data_file, "wb") as f:
+        pickle.dump(cached_data, f)
+else:
+    all_events_dataframe = pd.read_pickle(all_events_file)
+
+    with open(cached_data_file, "rb") as f:
+        cached_data = pickle.load(f)
+
+# define data and feature handles
 data_handle = data.Data()
 
 feature_collection = gp_features.GP_Feature_Collection(
     gp_features.GP_Features_affinity(),
-    gp_features.GP_Features_TTM(),
     gp_features.GP_Features_Question(),
     gp_features.GP_Features_user())
 
-# parameters for suggested questions
-hour_threshold_suggested_answer = 24
-only_open_questions_suggestable = False
-filter_nan_asker_id = True
-
-save_dir = "baseline_data"
 
-start_time = None  # data_utils.make_datetime("01.01.2012 00:01")
-end_time = data_utils.make_datetime("01.01.2016 00:01")  # data_utils.make_datetime("01.03.2012 00:01")
+# start and end of data
+start_time = data_utils.make_datetime("01.01.2012 00:01")
+end_time = data_utils.make_datetime("01.01.2017 00:01")  # data_utils.make_datetime("01.03.2012 00:01")
 
 all_feates_collector = list()
-
 n_candidates_collector = list()
 
-save_every = 10000
+save_every = 300
 q_a_pair_counter = 1
 
+## Approximate the question-age distribution
+if redo_histogram:
+    questionage_table = data_handle.query("SELECT a.id, (answercreationdate-CreationDate) as questionage FROM (SELECT parentid as Id, creationdate as answercreationdate FROM Posts WHERE PostTypeId=2) a LEFT JOIN Posts b ON a.Id=b.Id;")
+    questionage_table["questionage"] = questionage_table["questionage"].dt.days + (questionage_table["questionage"].dt.seconds) / (24 * 60 * 60)
+    age_vals = questionage_table["questionage"].values
+    age_vals = age_vals[age_vals > 0]
+    age_vals = age_vals[age_vals < 100]
+    hist, bins = np.histogram(age_vals, bins=500)
+    bin_midpoints = bins[1:]  # + np.diff(bins)/2
+    cdf = np.cumsum(hist)
+    cdf = cdf / cdf[-1]
+    values = np.random.rand(100)
+    value_bins = np.searchsorted(cdf, values)
+    random_from_cdf = bin_midpoints[value_bins]
+    with open("random_from_cdf.pickle", "wb") as outfile:
+        pickle.dump(random_from_cdf, outfile)
+else:
+    with open("random_from_cdf.pickle", "rb") as infile:
+        random_from_cdf = pickle.load(infile)
+
+
+user_dic = defaultdict(int)
+
+
+# START ITERATING THROUGH DATA
+prev_answertime = start_time
 for i, event in enumerate(data_utils.all_answer_events_iterator(timedelta(days=2), start_time=start_time, end_time=end_time)):
     if np.isnan(event.answerer_user_id) or np.isnan(event.asker_user_id):
         continue
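
Aside: the histogram block above and sample_open_questions together implement inverse-CDF sampling, so that the negative candidates follow the empirical question-age distribution. A minimal self-contained sketch of that idea, with synthetic ages standing in for the real data (all names here are illustrative, not from the repo):

import numpy as np

# Minimal sketch: draw samples that follow an empirical histogram (synthetic data).
rng = np.random.default_rng(0)
ages = rng.exponential(scale=5.0, size=10000)  # stand-in for real question ages
hist, bins = np.histogram(ages, bins=500)
bin_edges = bins[1:]                   # upper bin edges, as in the script above
cdf = np.cumsum(hist) / np.sum(hist)   # empirical CDF over the bins
uniform = rng.random(100)              # 100 uniform draws, as in the script
sampled_ages = bin_edges[np.searchsorted(cdf, uniform)]
# sampled_ages now approximately follows the empirical age distribution;
# sample_open_questions then picks real open questions whose ages match these draws.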
@@ -50,47 +141,64 @@
         avg_candidates = np.mean(n_candidates_collector)
         print("Preptraining at {}| on average {} candidates in the last {} suggested_question_events".format(event.answer_date, avg_candidates, len(n_candidates_collector)))
         n_candidates_collector = list()
-
-    if is_user_answers_suggested_event(event, hour_threshold_suggested_answer):
+
+    # only add to the data if the user has answered at least five questions and this answer comes more than 12 hours after the previous one
+    if user_dic[event.answerer_user_id] >= 5 and event.answer_date > prev_answertime + timedelta(hours=12):
+        open_questions = get_suggestable_questions(event.answer_date, cached_data, only_open_questions_suggestable, hour_threshold_suggested_answer, filter_nan_asker_id)
+        # add question age
+        question_dates = [pd.Timestamp(x) for x in open_questions["question_date"].values]
+        open_questions["question_age"] = [event.answer_date - question_event_time for question_event_time in question_dates]
+        open_questions["question_age"] = (open_questions["question_age"].dt.days + (open_questions["question_age"].dt.seconds) / (24 * 60 * 60))
 
-        suggestable_questions = get_suggestable_questions(event.answer_date, cached_data, only_open_questions_suggestable, hour_threshold_suggested_answer, filter_nan_asker_id)
-        if len(suggestable_questions) == 0:
-            # warnings.warn("For answer id {} (to question {}) there was not a single suggestable question".format(event.answer_id, event.question_id))
+        gt_ind = np.where(open_questions.question_id == event.question_id)[0]
+        if len(open_questions) == 0 or len(gt_ind) == 0:
+            print("Warning: question already answered, or for answer id {} (to question {}) there was not a single suggestable question".format(event.answer_id, event.question_id))
             continue
-
+        if len(open_questions) <= NR_NEG:
+            suggestable_questions = open_questions
+            manually = 0
+        else:
+            print("sampling")
+            suggestable_questions, manually = sample_open_questions(open_questions, random_from_cdf, gt_ind[0])
+
+        assert(np.any(suggestable_questions.question_id == event.question_id))
+
         n_candidates_collector.append(len(suggestable_questions))
 
-        # erst appenden wenn ueber einer bestimmten zeit? (only append once past a certain time?)
+        # append to feature and label list
         feats = feature_collection.compute_features(event.answerer_user_id, suggestable_questions, event.answer_date)
         label = suggestable_questions.question_id.values == event.question_id
 
         # add some more information
         feats["question_id"] = suggestable_questions.question_id.values.tolist()  # remember question ids
        feats["decision_time"] = q_a_pair_counter  # for MRR we need to remember the groups
         feats["label"] = label.astype(int)
+        feats["manually_added"] = manually
         feats["answer_date"] = pd.Series([event.answer_date for _ in range(len(feats))])
 
         all_feates_collector.append(feats)
 
         assert(np.sum(np.asarray(label).astype(int)) == 1)
-
+        q_a_pair_counter += 1
+        prev_answertime = event.answer_date
+
+        # save in between and clear variables in order to back up the data
+        if q_a_pair_counter % save_every == 0:
+            save_name = "feature_data_" + str((q_a_pair_counter + 1) // save_every) + ".csv"
+            features_table = pd.concat(all_feates_collector, axis=0)
+            features_table.to_csv(os.path.join(save_dir, save_name), index=False)
+            print("Successfully saved intermediate data", save_name)
+            del features_table
+            del all_feates_collector
+            all_feates_collector = list()
 
     feature_collection.update_pos_event(event)  # update features in any case
-
-
-    # save inbetween and clear variables in order to backup
-    if (q_a_pair_counter+1) % save_every == 0:
-        save_name = "feature_data_"+str((q_a_pair_counter+1)//save_every)+".csv"
-        features_table = pd.concat(all_feates_collector, axis=0)
-        features_table.to_csv(os.path.join(save_dir, save_name), index=False)
-        print("Successfully saved data inbetween", save_name)
-        del features_table
-        del all_feates_collector
-        all_feates_collector = list()
+    user_dic[event.answerer_user_id] += 1
 
 
-    q_a_pair_counter+=1
 
-if (q_a_pair_counter+1) % save_every != 0:  # last batch hasn't been saved
+if q_a_pair_counter % save_every != 0:  # last batch hasn't been saved
     save_name = "feature_data_" + str((q_a_pair_counter + 1) // save_every + 1) + ".csv"
     features_table = pd.concat(all_feates_collector, axis=0)
     features_table.to_csv(os.path.join(save_dir, save_name), index=False)
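
The batch files written above ("feature_data_*.csv" in save_dir) are what baseline_burel_rf.py reads back in below. A minimal sketch of that reload step, assuming the directory contains only the batch CSVs produced by this script:

import os
import pandas as pd

# Sketch: reload all checkpointed feature batches into one table
# (assumes the "burel_data" directory written by the script above).
data_dir = "burel_data"
dfs = [pd.read_csv(os.path.join(data_dir, f)) for f in sorted(os.listdir(data_dir))]
df_read = pd.concat(dfs, axis=0, ignore_index=True)
# every group of rows sharing one decision_time value is one question-answer event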

Baselines/baseline_burel_rf.py

Lines changed: 44 additions & 21 deletions
@@ -5,13 +5,18 @@
 import os
 import sys
 import utils
+import data_utils
 
-SPLIT = 0.8
+# define training and testing times
 success_n = 5
+SPLIT_OPTION = "time"  # "time" or "fold"
+TRAIN_END_DATE = data_utils.make_datetime("01.01.2015 00:01")
+TEST_START_DATE = data_utils.make_datetime("01.01.2015 00:01")
+TEST_END_DATE = data_utils.make_datetime("01.01.2017 00:01")
+SPLIT_FOLD = 0.9
 
 # Load data
-data_dir = "baseline_data"
-# df_read = pd.read_csv("baseline_data/feature_data_1.csv")
+data_dir = "burel_data"
 files = sorted(os.listdir(data_dir))
 print("available files:", files)
 dfs = []
@@ -24,30 +29,45 @@
 print("is sorted?", all(np.diff(df_read["decision_time"]) >= 0))
 
 # split in train and test
-num_events = len(np.unique(df_read["decision_time"].values, return_counts=True)[1])
-print("overall, in the data there are ", num_events, "question-answer events")
-print("on average, for each event there are ", len(df_read)//num_events, " open questions")
-cutoff = int(num_events * SPLIT)
-df_train = df_read[df_read["decision_time"]<cutoff]
-df_test = df_read[df_read["decision_time"]>=cutoff]
+events, _ = np.unique(df_read["decision_time"].values, return_counts=True)
+print("Overall, in the data there are ", len(events), "question-answer events")
+print("On average, for each event there are ", len(df_read) // len(events), " open questions")
+# split
+if SPLIT_OPTION == "fold":
+    cutoff = events[int(len(events) * SPLIT_FOLD)]
+    df_train = df_read[df_read["decision_time"] < cutoff]
+    df_test = df_read[df_read["decision_time"] >= cutoff]
+elif SPLIT_OPTION == "time":
+    df_read["answer_date"] = pd.to_datetime(df_read["answer_date"])
+    df_train = df_read[df_read["answer_date"] < TRAIN_END_DATE]
+    df_test = df_read[df_read["answer_date"] >= TEST_START_DATE]
+    df_test = df_test[df_test["answer_date"] < TEST_END_DATE]
+
+# print information about the data
+num_events = len(np.unique(df_train["decision_time"].values))
+print("In TRAIN there are ", num_events, "question-answer events")
+print("In TRAIN for each event there are ", len(df_train) // num_events, " open questions on average")
+num_events = len(np.unique(df_test["decision_time"].values))
+print("In TEST there are ", num_events, "question-answer events")
+print("In TEST for each event there are ", len(df_test) // num_events, " open questions on average")
 
 # Prepare training set
-X_train = df_train.drop(['label', 'decision_time', 'question_id', "answer_date"], axis=1)
+# dropped feature candidates: 'question_age', 'votes_mean', 'votes_sd', 'votes_sum', 'votes_max', 'votes_min', 'new'
+X_train = df_train.drop(['label', 'decision_time', 'question_id', "answer_date", 'tag_popularity', 'manually_added'], axis=1)
 features = X_train.columns.tolist()
 X_train = np.asarray(X_train)
 Y_train = df_train['label'].values
 G_train = df_train['decision_time'].values
-# print(sorted(np.unique(G_train//100)))
 
 # Prepare testing set
-X_test = df_test.drop(['label', 'decision_time', 'question_id', "answer_date"], axis=1)  # df_test[["questionage"]] #
+X_test = df_test.drop(['label', 'decision_time', 'question_id', "answer_date", 'tag_popularity', 'manually_added'], axis=1)  # df_test[["questionage"]] #
 X_test = np.asarray(X_test)
 Y_test = df_test['label'].values
 G_test = df_test['decision_time'].values
-# print(sorted(np.unique(G_test//100)))
+M_test = df_test['manually_added'].values
 assert(len(X_train) == len(Y_train))
 
-print("Size of training set: ", len(Y_train), " Test set:", len(Y_test))
+print("Size of training set: ", len(Y_train), " Test set:", len(Y_test), ", Nr features:", X_test.shape)
 class_counts = np.unique(Y_train, return_counts=True)[1]
 print("Class imbalance: 1:", class_counts[0] // class_counts[1])
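
The classifier training block between these two hunks is unchanged by the commit and therefore not shown; clf is already fitted by the time the next hunk runs. For orientation only, a hypothetical stand-in consistent with the file name (baseline_burel_rf.py suggests a random forest; the actual hyperparameters are not visible in this diff):

from sklearn.ensemble import RandomForestClassifier

# Hypothetical stand-in for the elided training block -- not the repo's actual code.
clf = RandomForestClassifier(n_estimators=100, class_weight="balanced", n_jobs=-1)
clf.fit(X_train, Y_train)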

@@ -76,24 +96,27 @@
 
 print("----------- Compute scores ----------------")
 
+# Train scores
 probs_train = clf.predict_proba(X_train)
 score, _ = utils.mrr(probs_train[:, 1], G_train, Y_train)
 print("Training MRR:", score)
 
+# Test scores
 probs_test = clf.predict_proba(X_test)
 pred_targets = probs_test[:, 1]
-score, ranks = utils.mrr(pred_targets, G_test, Y_test)
+score, ranks, nr_cands = utils.mrr_manually(pred_targets, G_test, Y_test, M_test)
 chance_mrr = utils.mrr3(out_probs=np.random.permutation(pred_targets), grouped_queries=G_test, ground_truth=Y_test)
 print("Testing MRR: ", score, ", Chance level:", chance_mrr)
 
-success_score = utils.success_at_n(pred_targets, G_test, Y_test, n=success_n)
+# Success at n
+success_score = utils.success_at_n_manually(pred_targets, G_test, Y_test, M_test, n=success_n)
 success_chance = utils.success_at_n(np.random.permutation(pred_targets), G_test, Y_test, n=success_n)
 print("Success at ", success_n, ":", success_score, ", Chance level:", success_chance)
 
 
 # SAVING TEST FEATURES
-# df_test_gt = df_test
-# df_test_gt["rank"] = ranks.tolist()
-# print(df_test_gt.head())
-
-# df_test_gt.to_csv("ranks_features.csv")
+df_test_gt = df_test
+df_test_gt["rank"] = ranks.tolist()
+df_test_gt["n_candidates"] = nr_cands.tolist()
+print(df_test_gt.head())
+df_test_gt.to_csv("results/new_burel_ranks_2015-16_wQA_wman.csv")
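
utils.mrr and utils.mrr_manually are repo-internal helpers whose implementations are not part of this diff. For reference, a generic sketch of what a grouped mean reciprocal rank computes (hypothetical helper, not the repo's API):

import numpy as np

# Generic grouped-MRR sketch: scores are predicted probabilities, groups are the
# decision_time values, labels mark the ground-truth question of each event.
def mean_reciprocal_rank(scores, groups, labels):
    reciprocal_ranks = []
    for g in np.unique(groups):
        mask = groups == g
        order = np.argsort(-scores[mask])  # candidates sorted by descending score
        gt_rank = np.where(labels[mask][order] == 1)[0][0] + 1  # 1 = ranked first
        reciprocal_ranks.append(1.0 / gt_rank)
    return np.mean(reciprocal_ranks), np.asarray(reciprocal_ranks)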
