forked from OlaShabalina/aws-data-analytics-workshop
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtemplate.yml
More file actions
144 lines (136 loc) · 4.46 KB
/
template.yml
File metadata and controls
144 lines (136 loc) · 4.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
---
# Workshop step 3: provisions an AWS Glue DataBrew dataset, recipe, project
# and recipe job, plus the IAM role DataBrew assumes to run them.
#
# The template references, but does not create:
#   - the S3 bucket movie-data-bucket-<account-id>-<region>
#   - the Glue Data Catalog table movies-workshop-db.movies
AWSTemplateFormatVersion: "2010-09-09"
Description: Step 3 - DataBrew project using Glue Data Catalog and scoped S3 access

Resources:
  # Service role for DataBrew. The inline policy is limited to the workshop
  # bucket and to the single Glue database/table the dataset reads.
  DataBrewRole:
    Type: AWS::IAM::Role
    Properties:
      # A fixed RoleName means the stack must be deployed with
      # CAPABILITY_NAMED_IAM.
      RoleName: movie-databrew-role
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Principal:
              Service: databrew.amazonaws.com
            Action: sts:AssumeRole
      Policies:
        - PolicyName: ScopedAccessForDataBrew
          PolicyDocument:
            Version: "2012-10-17"
            Statement:
              # Bucket-level action: the resource is the bucket ARN itself.
              - Effect: Allow
                Action:
                  - s3:ListBucket
                Resource: !Sub "arn:${AWS::Partition}:s3:::movie-data-bucket-${AWS::AccountId}-${AWS::Region}"
              # Object-level actions: the resource is every key in the bucket.
              - Effect: Allow
                Action:
                  - s3:GetObject
                  - s3:PutObject
                  - s3:DeleteObject
                Resource: !Sub "arn:${AWS::Partition}:s3:::movie-data-bucket-${AWS::AccountId}-${AWS::Region}/*"
              # Read-only Glue access to the catalog, database and table that
              # back the dataset defined below.
              - Effect: Allow
                Action:
                  - glue:GetDatabase
                  - glue:GetTable
                  - glue:GetPartitions
                Resource:
                  - !Sub "arn:${AWS::Partition}:glue:${AWS::Region}:${AWS::AccountId}:catalog"
                  - !Sub "arn:${AWS::Partition}:glue:${AWS::Region}:${AWS::AccountId}:database/movies-workshop-db"
                  - !Sub "arn:${AWS::Partition}:glue:${AWS::Region}:${AWS::AccountId}:table/movies-workshop-db/movies"

  # Dataset that reads its input from the Glue Data Catalog table
  # movies-workshop-db.movies.
  DataBrewDataset:
    Type: AWS::DataBrew::Dataset
    Properties:
      Name: movies-dataset
      Input:
        DataCatalogInputDefinition:
          DatabaseName: movies-workshop-db
          TableName: movies

  # Project that ties the dataset, recipe and role together and works on a
  # sample of the first 500 rows.
  DataBrewProject:
    Type: AWS::DataBrew::Project
    Properties:
      # Plain string: the name contains no variables, so !Sub is unnecessary.
      Name: databrew-project
      DatasetName: !Ref DataBrewDataset
      RoleArn: !GetAtt DataBrewRole.Arn
      RecipeName: !Ref DataBrewRecipe
      Sample:
        Type: FIRST_N
        Size: 500

  # Recipe with two parallel four-step sequences, one for `genres` and one
  # for `spoken_languages`: split on '; ', unpivot the numbered columns,
  # delete the index column, then remove rows with a missing value.
  DataBrewRecipe:
    Type: AWS::DataBrew::Recipe
    Properties:
      Name: workshop-recipe
      Description: Recipe for normalizing genres and spoken_languages
      Steps:
        # --- genres ---
        # NOTE(review): isText is 'TRUE' while includeInSplit is 'true';
        # confirm DataBrew accepts the upper-case form.
        - Action:
            Operation: SPLIT_COLUMN_SINGLE_DELIMITER
            Parameters:
              sourceColumn: genres
              pattern: '; '
              limit: '6'
              isText: 'TRUE'
              includeInSplit: 'true'
        # Unpivots seven columns; presumably a limit of 6 splits yields
        # genres_1..genres_7 - TODO confirm against the DataBrew reference.
        - Action:
            Operation: UNPIVOT
            Parameters:
              sourceColumns: >-
                ["genres_1","genres_2","genres_3","genres_4","genres_5","genres_6","genres_7"]
              unpivotColumn: genre_index
              valueColumn: genre
        - Action:
            Operation: DELETE
            Parameters:
              sourceColumns: '["genre_index"]'
        - Action:
            Operation: REMOVE_MISSING
            Parameters:
              sourceColumn: genre

        # --- spoken_languages ---
        - Action:
            Operation: SPLIT_COLUMN_SINGLE_DELIMITER
            Parameters:
              sourceColumn: spoken_languages
              pattern: '; '
              limit: '4'
              isText: 'TRUE'
              includeInSplit: 'true'
        # NOTE(review): valueColumn reuses the name of the split source
        # column (spoken_languages), unlike the genres/genre pair above;
        # verify this does not collide with the original column.
        - Action:
            Operation: UNPIVOT
            Parameters:
              sourceColumns: >-
                ["spoken_languages_1","spoken_languages_2","spoken_languages_3","spoken_languages_4","spoken_languages_5"]
              unpivotColumn: lang_index
              valueColumn: spoken_languages
        - Action:
            Operation: DELETE
            Parameters:
              sourceColumns: '["lang_index"]'
        - Action:
            Operation: REMOVE_MISSING
            Parameters:
              sourceColumn: spoken_languages

  # Recipe job that runs the project's recipe and writes comma-delimited CSV
  # under clean/movies/ in the workshop bucket.
  # No explicit DependsOn is needed: the !Ref and !GetAtt below already make
  # CloudFormation create the project and the role before this job.
  DataBrewJob:
    Type: AWS::DataBrew::Job
    Properties:
      Name: movies-clean-job
      Type: RECIPE
      ProjectName: !Ref DataBrewProject
      RoleArn: !GetAtt DataBrewRole.Arn
      Outputs:
        - Location:
            Bucket: !Sub "movie-data-bucket-${AWS::AccountId}-${AWS::Region}"
            Key: "clean/movies/"
            BucketOwner: !Ref AWS::AccountId
          Format: CSV
          FormatOptions:
            Csv:
              Delimiter: ","

Outputs:
  ProjectName:
    Value: !Ref DataBrewProject
    Description: The name of the DataBrew project
  DatasetName:
    Value: !Ref DataBrewDataset
    Description: The name of the DataBrew dataset
  DataBrewRoleArn:
    Value: !GetAtt DataBrewRole.Arn
    Description: IAM Role used by DataBrew