-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcluster_gen.m
More file actions
153 lines (135 loc) · 4.74 KB
/
cluster_gen.m
File metadata and controls
153 lines (135 loc) · 4.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
clear;
addpath('../ManifoldClassification');

% ---- parameters ----
n = 5000;   % number of points
k = 5;      % number of nearest neighbors
dim = 3;    % ambient dimension of the embedded manifold

% ---- synthetic data: two-component Gaussian mixture in 2-D ----
ppm = [n/2 n/2];              % points per mixture component
centers = [5, 12; 12, 12];    % component means (one row per component)
stdev = 2;                    % shared standard deviation
[data2, labels] = makegaussmixnd(centers, stdev, ppm);
labels = labels - 1;          % relabel the two components as 0/1

% ---- lift the 2-D data onto a 3-D swiss-roll-style manifold ----
t = data2(:, 1);
u = data2(:, 2);
data3 = [t .* cos(t), u, t .* sin(t), labels'];
perm = randperm(size(data3, 1));   % shuffle the point order
data3 = data3(perm, :);
labels = data3(:, end);
coords = data3(:, 1:end-1);

% clean up workspace
clearvars data2 t u stdev perm;
% ---- k nearest neighbors ----
d_euc = squareform(pdist(coords, 'euclidean'));
[~, nn_idx] = sort(d_euc);
nn_idx = nn_idx(2:k+1, :);   % k nearest-neighbor indices per point (row 1 is self)
% clean up workspace
clearvars d_euc;

% ---- adjacency matrix & diversity scores ----
% adj(i,j) == 1 iff point i is one of the k nearest neighbors of point j.
adj = zeros(n, n);
% pre_div(cc,c) == 1 iff the cc-th neighbor of c carries a different label than c.
pre_div = zeros(k, n);
div_scores = zeros(n, n);

% BUGFIX: the adjacency matrix must be filled completely BEFORE the row-wise
% div_scores pass. The original single loop read adj(c,:) while columns
% c+1..n were still empty, silently dropping the contribution of every
% neighbor relation discovered later in the scan and biasing div low.
for c = 1:n
    for cc = 1:k
        idx = nn_idx(cc, c);
        adj(idx, c) = 1;
        pre_div(cc, c) = labels(idx) ~= labels(c);
    end
end
% Fraction of label-disagreeing neighbors for each point (1 x n).
frac = sum(pre_div, 1) / k;
for c = 1:n
    % Spread point c's neighborhood heterogeneity to every point that has
    % c among its k nearest neighbors.
    div_scores(c, :) = adj(c, :) * frac(c);
end

% Diversity score in [0,1] (1 = most heterogeneous), based on each point's
% k nearest neighbors and their own neighborhoods.
div = zeros(n, 1);
for c = 1:n
    div(c) = sum(div_scores(:, c)) / k;
end
% clean up workspace
clearvars pre_div div_scores frac idx c cc;
% Sort the diversity scores ascending; 'sort' (not 'sortrows') is the
% idiomatic call for a column vector and behaves identically here.
div_sort = sort(div);
max_div = div_sort(end);    % maximum diversity score
st = std(div(div > 0));     % standard deviation of the nonzero diversity scores
% Range of scores based on the standard deviation.
% NOTE(review): the variable name 'range' shadows the Statistics Toolbox
% function range(); kept unchanged because the commented plot below uses it.
range = [max_div-(2*st) max_div];
% % Plot diversity scores & threshold
% plot(1:n, div_sort, '-')
% title('Diversity Scores')
% hold on
% plot([1 n], [range(1) range(1)], 'r-')
% hold off;
% % clean up workspace
% clearvars max_div div_sort st;
% ---- flag the high-diversity point cloud ----
% method(i) == 1 marks points later handled by the regression model,
% method(i) == 0 points handled by the kNN model.
method = zeros(n, 1);
idx_range = find(div > range(1));
method(idx_range) = 1;
% ---- grow the selection into a point cloud around the high-diversity points ----
% Collect the k nearest neighbors of every above-threshold point.
cloud_nn = unique(reshape(nn_idx(:, idx_range)', [], 1));
cloud = coords(cloud_nn, :);
ch = delaunayn(cloud);   % Delaunay triangulation of the cloud
% A point is "in the hull" when tsearchn locates it inside some simplex
% of the triangulation (NaN means outside).
in_hull = ~isnan(tsearchn(cloud, ch, coords));
cloud_idx = find(in_hull);
method(cloud_idx) = 1;
% % plot manifold with clusters
% plot_labelled(coords, 'a', labels, 'gb', 'Manifold');
% % plot method separation
% plot_labelled(coords, 'a', method, 'kr', 'nn Point Cloud');
% ---- split the data by method flag ----
% FIX: index on 'method' directly instead of rebuilding a concatenated
% matrix and reading hard-coded column 5 (which breaks if 'dim' changes).
reg_idx = method == 1;   % points handled by the regression model
knn_idx = method == 0;   % points handled by the kNN model
reg_coords = coords(reg_idx, :);
reg_labels = labels(reg_idx);
knn_coords = coords(knn_idx, :);
knn_labels = labels(knn_idx);
clearvars reg_idx knn_idx method idx_range div max_div;
% ---- reference models fit on all data ----
% NOTE(review): these are overwritten by the per-fold fits below; kept so the
% script can be run up to this point and still yield usable models.
knn_model = fitcknn(knn_coords, knn_labels, 'NumNeighbors', 5, ...
    'Distance', 'euclidean');
reg_model = glmfit(reg_coords, reg_labels, 'binomial');

% ---- k-fold evaluation ----
parts = 10;                  % number of folds
kn = size(knn_coords, 1);    % number of kNN-handled points
rn = n - kn;                 % number of regression-handled points
kt = floor(kn / parts);      % kNN test-fold size (last fold absorbs the remainder)
rt = floor(rn / parts);      % regression test-fold size
% FIX: size the result by 'parts' instead of the hard-coded 10.
kfold_mse = zeros(2, parts); % row 1: kNN MSE per fold, row 2: regression MSE per fold
for c = 1:parts
    kr = [(c-1)*kt c*kt];  % index range of the kNN test fold
    rr = [(c-1)*rt c*rt];  % index range of the regression test fold
    if c == 1   % train on all points after the test fold
        knn_model = fitcknn(knn_coords((kr(2)+1):kn,:), ...
            knn_labels((kr(2)+1):kn,:), 'NumNeighbors', 5, ...
            'Distance', 'euclidean');
        reg_model = glmfit(reg_coords((rr(2)+1):rn,:), ...
            reg_labels((rr(2)+1):rn,:), 'binomial');
    elseif c == parts   % train on all points before the test fold
        knn_model = fitcknn(knn_coords(1:kr(1),:), knn_labels(1:kr(1),:), ...
            'NumNeighbors', 5, 'Distance', 'euclidean');
        reg_model = glmfit(reg_coords(1:rr(1),:), reg_labels(1:rr(1)), ...
            'binomial');
        % compensate for floor() rounding: last fold absorbs the remainder
        kr(2) = kn;
        rr(2) = rn;
    else   % train on the points surrounding the test fold
        knn_model = fitcknn(knn_coords([1:kr(1) (kr(2)+1):kn],:), ...
            knn_labels([1:kr(1) (kr(2)+1):kn],:), ...
            'NumNeighbors', 5, 'Distance', 'euclidean');
        reg_model = glmfit(reg_coords([1:rr(1) (rr(2)+1):rn],:), ...
            reg_labels([1:rr(1) (rr(2)+1):rn]), 'binomial');
    end
    % kNN evaluation.
    % BUGFIX: normalize by the test-fold size (mean), not the full set size
    % kn/rn — the original underestimated every fold's MSE by a factor of
    % roughly 'parts'.
    knn_pred = predict(knn_model, knn_coords((kr(1)+1):kr(2), :));
    kfold_mse(1, c) = mean((knn_pred - knn_labels((kr(1)+1):kr(2))).^2);
    % Regression evaluation: squared error of the predicted probability
    % against the 0/1 label (Brier-style score).
    reg_pred = glmval(reg_model, reg_coords((rr(1)+1):rr(2), :), 'logit');
    kfold_mse(2, c) = mean((reg_pred - reg_labels((rr(1)+1):rr(2))).^2);
end
% Average MSE per model across folds, then across the two models.
avg_mse = mean(kfold_mse, 2);
model_mse = mean(avg_mse);
clearvars kr rr kt rt kn rn parts;
% TODO: evaluate "near-close" (near-boundary) classification error
% TODO: quantify the uncertainty of that error estimate
% TODO: plot an F1-score curve and locate its inflection point