# LAB5_assignment3.py (from the batemansogq/Python_bkcde repository, master branch)

import pandas as pd
from datetime import timedelta
import matplotlib.pyplot as plt
import matplotlib

plt.style.use('ggplot')  # Apply the ggplot theme to the figures drawn below

#
# INFO: This dataset has call records for 10 users tracked over the course of 3 years.
# Your job is to find out where the users likely live at!


def showandtell(title=None):
  """Optionally save the current matplotlib figure, then display it.

  Args:
    title: Base name of the image file. When given, the current figure is
      written to ``<title>.png`` (tight bounding box, 300 dpi) before the
      figures are shown. When ``None`` (the default) nothing is saved.
  """
  # Identity is the correct test for None; `!= None` goes through the
  # object's comparison operators, which may be overloaded.
  if title is not None:
    plt.savefig(title + ".png", bbox_inches='tight', dpi=300)
  plt.show()

def clusterInfo(model):
  """Print the inertia plus the centroid and sample count of each cluster.

  Args:
    model: A fitted clustering model exposing ``inertia_``,
      ``cluster_centers_`` and ``labels_``. ``labels_`` must support an
      elementwise ``== i`` comparison whose result has ``.sum()``.
  """
  # Every print receives one pre-formatted string, so the output is the same
  # under the Python 2 print statement and the Python 3 print function.
  print("Cluster Analysis Inertia:  {0}".format(model.inertia_))
  print('------------------------------------------')
  for i, centroid in enumerate(model.cluster_centers_):
    print("\n  Cluster  {0}".format(i))
    print("    Centroid  {0}".format(centroid))
    # Summing the boolean mask counts the samples assigned to cluster i.
    print("    #Samples  {0}".format((model.labels_ == i).sum()))

# Find the cluster with the least # attached nodes
def clusterWithFewestSamples(model):
  """Return a mask selecting the samples of the least-populated cluster.

  Ties are resolved in favour of the lowest cluster index. If the model has
  no cluster centres at all, cluster 0 is used.

  Args:
    model: A fitted clustering model exposing ``cluster_centers_`` and
      ``labels_``. ``labels_`` must support an elementwise ``== i``
      comparison whose result has ``.sum()``.

  Returns:
    The elementwise result of ``model.labels_ == <smallest cluster index>``.
  """
  # Count the members of each cluster exactly once.
  sizes = [(model.labels_ == i).sum()
           for i in range(len(model.cluster_centers_))]

  # Ensure there's at least one cluster before looking for the smallest;
  # min() returns the first index among equally small clusters.
  minCluster = 0
  if sizes:
    minCluster = min(range(len(sizes)), key=sizes.__getitem__)

  # Single pre-formatted argument: same output on Python 2 and Python 3.
  print("\n  Cluster With Fewest Samples:  {0}".format(minCluster))
  return (model.labels_ == minCluster)


def doKMeans(data, clusters=0):
  """Fit K-Means on the tower coordinates, plot the result, return the model.

  Only the ``TowerLat`` and ``TowerLon`` columns are fed to the algorithm, in
  that order (latitude first), so column 0 of ``model.cluster_centers_`` is
  latitude and column 1 is longitude. No feature scaling is applied, since
  both coordinates are approximately on the same scale. The centroids are
  printed and drawn as red triangles on top of the samples.

  Args:
    data: DataFrame containing at least ``TowerLat`` and ``TowerLon`` columns.
    clusters: Number of clusters to fit; must be at least 1. The default of
      0 is kept for backward compatibility of the signature but is rejected.

  Returns:
    The fitted ``sklearn.cluster.KMeans`` model.

  Raises:
    ValueError: If ``clusters`` is smaller than 1.
  """
  # Fail early with a clear message instead of deep inside the estimator.
  if clusters < 1:
    raise ValueError(
        "clusters must be a positive integer, got {0!r}".format(clusters))

  # Function-scope import, as in the original: scikit-learn is only needed
  # when clustering is actually requested.
  from sklearn.cluster import KMeans

  coords = data[['TowerLat', 'TowerLon']]

  model = KMeans(n_clusters=clusters)
  model.fit(coords)

  centroids = model.cluster_centers_
  print(centroids)

  # Draw samples and centroids on one shared axes. Requesting
  # add_subplot(111) a second time is unnecessary and, depending on the
  # matplotlib version, may stack a separate axes on top of the first.
  fig = plt.figure()
  ax = fig.add_subplot(111)
  ax.scatter(coords.TowerLat, coords.TowerLon, c='purple', marker='o', alpha=0.2)
  ax.scatter(centroids[:, 0], centroids[:, 1], c='r', marker='^')
  ax.set_title('Week Calls (<5pm) with Centroid')

  return model


#
# TODO: Load up the dataset and take a peek at its head and dtypes.
# Convert the date using pd.to_datetime, and the time using pd.to_timedelta
#
# Read the raw call records from disk.
df_CDR = pd.read_csv(
    'C:/Users/mckinns/Documents/GitHub/DAT210x/Module5/Datasets/CDR.csv',
    sep=',',
    header=0,
)

# Parse the date column into datetimes and the two clock-style columns into
# timedeltas; errors='coerce' turns unparseable entries into NaT.
df_CDR = df_CDR.assign(
    CallDate=pd.to_datetime(df_CDR['CallDate'], errors='coerce'),
    CallTime=pd.to_timedelta(df_CDR['CallTime'], errors='coerce'),
    Duration=pd.to_timedelta(df_CDR['Duration'], errors='coerce'),
)


#
# TODO: Create a unique list of the phone-number values (users) stored in the
# "In" column of the dataset, and save it to a variable called `unique_numbers`.
# Manually check through unique_numbers to ensure the order the numbers appear is
# the same order they appear (uniquely) in your dataset:
#
# Distinct phone numbers, in order of first appearance in the "In" column.
unique_numbers = df_CDR.In.unique()


#
# INFO: The locations map above should be too "busy" to really wrap your head around. This
# is where domain expertise comes into play. Your intuition tells you that people are likely
# to behave differently on weekends:
#
# On Weekends:
#   1. People probably don't go into work
#   2. They probably sleep in late on Saturday
#   3. They probably run a bunch of random errands, since they couldn't during the week
#   4. They should be home, at least during the very late hours, e.g. 1-4 AM
#
# On Weekdays:
#   1. People probably are at work during normal working hours
#   2. They probably are at home in the early morning and during the late night
#   3. They probably spend time commuting between work and home everyday


# Position, within unique_numbers, of the user under examination. One value
# drives both the progress message and the row filter, so the message can
# no longer report a different user than the one actually selected.
user_index = 8
print("\n\nExamining person:  {0}".format(user_index))

#
# Slice the dataset down to the records whose "In" feature (user phone
# number) equals the selected entry of the unique list above.
#
user1 = df_CDR[df_CDR.In == unique_numbers[user_index]]

#
# Keep only Weekday (Mon-Fri) values. Filtering on the DOW column directly
# also works when the user has no Saturday or no Sunday records at all, a
# case in which one-hot encoding would not create the matching column.
#
user1 = user1[~user1.DOW.isin(['Sat', 'Sun'])]

#
# The idea is that the call was placed before 5pm. From Midnight-730a, the user is
# probably sleeping and won't call / wake up to take a call. There should be a brief time
# in the morning during their commute to work, then they'll spend the entire day at work.
# So the assumption is that most of the time is spent either at work, or in 2nd, at home.
#
# CallTime holds timedeltas, so compare against an explicit Timedelta rather
# than relying on implicit parsing of a string.
user1 = user1[user1.CallTime < pd.Timedelta(hours=17)]

# reset_index returns a new frame, so the result has to be assigned for the
# renumbering to take effect; drop=True avoids adding the old labels as a column.
user1 = user1.reset_index(drop=True)
#
# TODO: Plot the Cell Towers the user connected to
#
# One figure with a single axes; both names stay at module level because the
# centroid markers are added to `ax` further down.
fig, ax = plt.subplots()
ax.scatter(user1['TowerLat'], user1['TowerLon'], alpha=0.2, marker='o', c='g')
ax.set_title('Week Calls (<5pm)')

#
# INFO: Run K-Means with K=3 or K=4. There really should only be two areas of concentration. If you
# notice multiple areas that are "hot" (multiple areas the user spends a lot of time at that are FAR
# apart from one another), then increase K=5, with the goal being that all centroids except two will
# sweep up the annoying outliers and not-home, not-work travel occasions. the other two will zero in
# on the user's approximate home location and work locations. Or rather the location of the cell
# tower closest to them.....
model = doKMeans(data=user1, clusters=3)

# Dump the raw fit: per-sample cluster assignments, then centroid coordinates.
print(model.labels_)
print(model.cluster_centers_)

# 32.721986,-96.8927757
# u1 - 32.9000009  -96.90951639
#u2 - 32.69557708 -96.93522725
# u3 - 32.77992299 -96.89338791
# u4 - 32.85371225 -96.8472893
#u5 - 32.92195886 -96.75768121
# u6 - 32.81198486 -96.87034706
# u7 - 32.75203793 -96.74437494
# u8 - 32.72097347 -96.83039184
# u9 - 32.72145328 -96.89115458
# u10 - 32.98500948 -96.80262338


#make a table display
# Tally how many samples were assigned to each cluster label.
ps = pd.Series(model.labels_)
counts = ps.value_counts()
# Parenthesised single argument: valid and identical on Python 2 and 3.
print(counts)


#
# INFO: Print out the mean CallTime value for the samples belonging to the cluster with the LEAST
# samples attached to it. If our logic is correct, the cluster with the MOST samples will be work.
# The cluster with the 2nd most samples will be home. And the K=3 cluster with the least samples
# should be somewhere in between the two. What time, on average, is the user in between home and
# work, between the midnight and 5pm?
# Boolean mask of the samples in the least-populated cluster, used to pick
# the matching rows of user1.
midWayClusterIndices = clusterWithFewestSamples(model)
midWaySamples = user1[midWayClusterIndices]
# One pre-formatted string keeps the output identical on Python 2 and 3.
print("    Its Waypoint Time:  {0}".format(midWaySamples.CallTime.mean()))


#
# Let's visualize the results!
# First draw the X's for the clusters:
# The model is fitted on (TowerLat, TowerLon), so column 0 of
# cluster_centers_ is latitude and column 1 is longitude. The tower scatter
# on `ax` uses latitude for x and longitude for y; the centroid markers must
# use the same order or they land away from the data they summarise.
ax.scatter(model.cluster_centers_[:, 0], model.cluster_centers_[:, 1],
           s=169, c='r', marker='x', alpha=0.8, linewidths=2)
#
# Then save the results. doKMeans opened its own figure after `fig` was
# created, so re-activate `fig` first; otherwise the saved image would be
# the most recently created figure rather than the one carrying the X's.
plt.figure(fig.number)
showandtell('Weekday Calls Centroids')  # Comment this line out when you're ready to proceed

# Create a boolean array marking the samples assigned to cluster 2.
lt_bol = model.labels_ == 2
# Wrap it in a named Series so it can sit alongside the DataFrame.
Bo = pd.Series(lt_bol, name='bools')
# Filter the frame down to that cluster. A bare expression is silently
# discarded when the file runs as a script, so print the result explicitly.
# The raw array (.values) is used as the mask to avoid index alignment.
print(user1[Bo.values])

# Summary statistics of the call times in the filtered weekday slice.
print(user1['CallTime'].describe())