-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathS3.py
More file actions
104 lines (95 loc) · 3.72 KB
/
S3.py
File metadata and controls
104 lines (95 loc) · 3.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/python
#-*-coding:utf-8-*-
'''@author:duncan'''
import Initial as init
import Metric
import numpy as np
import math
import time
class S3:
def __init__(self,users,num,R,k,beta):
# 用户数据
self.users = users
# 子集大小
self.k = k
# 平衡参数
self.beta = beta
# 代表性矩阵
self.R = R
# 总人数
self.num = num
# 分组
def SplitToGroups(self):
# 按照location分组
groups = {}
for i in range(len(self.users)):
location = self.users.loc[i]['location']
if location not in groups.keys():
groups[location] = [i]
else:
groups[location].append(i)
return groups
# 定义组代表性
def Q(self,profiles,groups):
start = time.time()
profiles_groups = {}
for i in profiles:
index = self.users[self.users['userid'] == i].index[0]
location = self.users.loc[index]['location']
if location not in profiles_groups.keys():
profiles_groups[location] = [self.users[(self.users['userid'] == i)].index[0]]
else:
profiles_groups[location].append(self.users[(self.users['userid'] == i)].index[0])
# 对每个group计算P函数值
Q_score = 0
for key in profiles_groups.keys():
cur_group = profiles_groups[key]
orginal_group = groups[key]
# 统计cur_group中每个
temp = self.R[cur_group,:]
P = np.sum(np.mean(temp[:,orginal_group],axis=0),axis=0)
Q_score += math.pow(len(groups[key]),self.beta) * math.pow(P,self.beta)
end = time.time()
print "计算Q_score花费%fs" % (end - start)
return Q_score
# 直接假设已经按照领域划分了group
def SearchWithNoGroups(self):
profiles = set()
results_vector = np.asarray([0 for i in xrange(self.num)])
while len(profiles) < self.k:
# 每次选择使得rc最大的加入
# results = {i:np.sum(np.mean(np.vstack((results_vector,self.R[i])),axis=0),axis=0) / (len(profiles)+1) for i in range(self.num) if i not in profiles}
results = {i:np.sum(results_vector * len(profiles) + self.R[i],axis=0) / self.num for i in range(self.num) if i not in profiles}
profiles.add(max(results.items(),key=lambda key:key[1])[0])
# 更新results_vector
results_vector = (results_vector * len(profiles) + self.R[i]) / len(profiles)
# print len(profiles)
# 将索引转换成用户id
profiles = [self.users.loc[i]['userid'] for i in profiles]
return profiles
# 贪心算法寻找
def Search(self):
groups = self.SplitToGroups()
profiles = set()
count = 0
while len(profiles) < self.k:
results = {self.users.loc[i]['userid']:self.Q(profiles | set([self.users.loc[i]['userid']]),groups) for i in range(len(self.users)) if self.users.loc[i]['userid'] not in profiles}
# 排序results,保留最大值
profiles.add(max(results.items(),key=lambda key:key[1])[0])
count += 1
print count
print profiles
return profiles
def test():
category = "Sports"
users,R,id_list,g = init.Init(category)
metric = Metric.Metrics(users,R,id_list,g)
num = len(users) * 0.05
s3 = S3(users,len(users),R,num,0.6)
start_time = time.time()
profiles = s3.SearchWithNoGroups()
end_time = time.time()
print "cost %f s" % (end_time - start_time)
rc,rt,rscore = metric.RScore(profiles)
print "特征代表性:%f,拓扑结构代表性:%f,rscore:%f" % (rc,rt,rscore)
test()