-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathNashville-Data-Cleaning
More file actions
193 lines (142 loc) · 4.17 KB
/
Nashville-Data-Cleaning
File metadata and controls
193 lines (142 loc) · 4.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
--cleaning data in SQL queries
--Standardize date format
SELECT *
FROM NashvilleData
SELECT SaleDate, CONVERT(Date, SaleDate)
FROM NashvilleData
ALTER TABLE NashvilleData
ADD SaleDateConverted Date
UPDATE NashvilleData
SET SaleDateConverted = CONVERT(Date, SaleDate)
---Populate Property Address Data
SELECT *
FROM NashvilleData
WHERE PropertyAddress IS NULL
SELECT A.ParcelID, A.PropertyAddress, B.ParcelID, B.PropertyAddress, ISNULL(A.PropertyAddress, B.PropertyAddress)
FROM NashvilleData A
JOIN NashvilleData B
ON A.ParcelID = B.ParcelID
AND A.[UniqueID ] <> B.[UniqueID ]
WHERE A.PropertyAddress IS NULL
UPDATE A
SET PropertyAddress = ISNULL(A.PropertyAddress, B.PropertyAddress)
FROM NashvilleData A
JOIN NashvilleData B
ON A.ParcelID = B.ParcelID
AND A.[UniqueID ] <> B.[UniqueID ]
WHERE A.PropertyAddress IS NULL
--Breaking out address into individual columns (Address, City, State)
SELECT PropertyAddress
FROM NashvilleData
--Using Substring
SELECT
SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress) -1) AS Address
FROM NashvilleData
SELECT
SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress) -1) AS Address,
SUBSTRING(PropertyAddress, CHARINDEX(',', PropertyAddress) +1, LEN(PropertyAddress)) AS Address
FROM NashvilleData
ALTER TABLE NashvilleData
ADD PropertySplitAddress nvarchar (255)
UPDATE NashvilleData
SET PropertySplitAddress = SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress) -1)
ALTER TABLE NashvilleData
ADD PropertySplitCity nvarchar (255)
UPDATE NashvilleData
SET PropertySplitCity = SUBSTRING(PropertyAddress, CHARINDEX(',', PropertyAddress) +1, LEN(PropertyAddress))
SELECT PropertySplitAddress, PropertySplitCity
FROM NashvilleData
--Using Parsename
SELECT
PARSENAME(REPLACE(PropertyAddress, ',','.'), 2),
PARSENAME(REPLACE(PropertyAddress, ',','.'), 1)
FROM NashvilleData
ALTER TABLE NashvilleData
ADD PropertySplitAddress1 nvarchar (255)
UPDATE NashvilleData
SET PropertySplitAddress1 = PARSENAME(REPLACE(PropertyAddress, ',','.'), 2)
ALTER TABLE NashvilleData
ADD PropertySplitCity1 nvarchar (255)
UPDATE NashvilleData
SET PropertySplitCity1 = PARSENAME(REPLACE(PropertyAddress, ',','.'), 1)
SELECT PropertySplitAddress1, PropertySplitCity1
FROM NashvilleData
--Separating Owner Address
--Using Parsename
SELECT
PARSENAME(REPLACE(OwnerAddress, ',','.'), 3),
PARSENAME(REPLACE(OwnerAddress, ',','.'), 2),
PARSENAME(REPLACE(OwnerAddress, ',','.'), 1)
FROM NashvilleData
ALTER TABLE NashvilleData
ADD OwnerSplitAddress nvarchar (255)
UPDATE NashvilleData
SET OwnerSplitAddress = PARSENAME(REPLACE(OwnerAddress, ',','.'), 3)
ALTER TABLE NashvilleData
ADD OwnerSplitCity nvarchar (255)
UPDATE NashvilleData
SET OwnerSplitCity = PARSENAME(REPLACE(OwnerAddress, ',','.'), 2)
ALTER TABLE NashvilleData
ADD OwnerSplitState nvarchar (255)
UPDATE NashvilleData
SET OwnerSplitState = PARSENAME(REPLACE(OwnerAddress, ',','.'), 1)
SELECT OwnerSplitAddress, OwnerSplitCity, OwnerSplitState
FROM NashvilleData
--Change Y and N to 'Yes' and 'No' in Sold As Vacant field
SELECT DISTINCT(SoldAsVacant), COUNT(SoldAsVacant)
FROM NashvilleData
GROUP BY SoldAsVacant
ORDER BY 2
SELECT SoldAsVacant,
CASE WHEN SoldAsVacant = 'Y' THEN 'Yes'
WHEN SoldAsVacant = 'N' THEN 'NO'
ELSE SoldAsVacant
END
FROM NashvilleData
UPDATE NashvilleData
SET SoldAsVacant = CASE WHEN SoldAsVacant = 'Y' THEN 'Yes'
WHEN SoldAsVacant = 'N' THEN 'NO'
ELSE SoldAsVacant
END
FROM NashvilleData
SELECT SoldAsVacant
FROM NashvilleData
--Remove Duplicates
--Using row number
WITH RowNumCTE AS(
SELECT *,
ROW_NUMBER() OVER(
PARTITION BY ParcelID,
PropertyAddress,
SalePrice,
SaleDate,
LegalReference
ORDER BY
UniqueID
) row_num
FROM NashvilleData
)
SELECT *
FROM RowNumCTE
WHERE row_num > 1
ORDER BY PropertyAddress
--To delete the duplicate rows
WITH RowNumCTE AS(
SELECT *,
ROW_NUMBER() OVER(
PARTITION BY ParcelID,
PropertyAddress,
SalePrice,
SaleDate,
LegalReference
ORDER BY
UniqueID
) row_num
FROM NashvilleData
)
DELETE
FROM RowNumCTE
WHERE row_num > 1
--Delete unused columns (seek permission before doing this)
ALTER TABLE NashvilleData
DROP COLUMN PropertyAddress, SaleDate, OwnerAddress, TaxDistrict