aws-data-analytics-workshop/steps/step3-databrew/template.yml at main · oz-cloudtools-meetup/aws-data-analytics-workshop · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# CloudFormation template for workshop step 3: an IAM role plus the DataBrew
# dataset, project, recipe and job declared under Resources.
AWSTemplateFormatVersion: "2010-09-09"
Description: >-
  Step 3 - DataBrew project using Glue Data Catalog and scoped S3 access

Resources:
  # IAM role assumed by the DataBrew service. Its single inline policy is
  # limited to the workshop S3 bucket and the `movies` table in the Glue
  # Data Catalog.
  DataBrewRole:
    Type: AWS::IAM::Role
    Properties:
      RoleName: movie-databrew-role
      # Trust policy: the only principal listed is databrew.amazonaws.com.
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Action: sts:AssumeRole
            Principal:
              Service: databrew.amazonaws.com
      Policies:
        - PolicyName: ScopedAccessForDataBrew
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              # Bucket-level permission: list the workshop bucket itself.
              - Effect: Allow
                Resource: !Sub 'arn:aws:s3:::movie-data-bucket-${AWS::AccountId}-${AWS::Region}'
                Action:
                  - s3:ListBucket
              # Object-level permissions on every key inside that bucket.
              - Effect: Allow
                Resource: !Sub 'arn:aws:s3:::movie-data-bucket-${AWS::AccountId}-${AWS::Region}/*'
                Action:
                  - s3:GetObject
                  - s3:PutObject
                  - s3:DeleteObject
              # Read-only Glue access, scoped to the catalog, the
              # movies-workshop-db database and its movies table.
              - Effect: Allow
                Resource:
                  - !Sub 'arn:aws:glue:${AWS::Region}:${AWS::AccountId}:catalog'
                  - !Sub 'arn:aws:glue:${AWS::Region}:${AWS::AccountId}:database/movies-workshop-db'
                  - !Sub 'arn:aws:glue:${AWS::Region}:${AWS::AccountId}:table/movies-workshop-db/movies'
                Action:
                  - glue:GetDatabase
                  - glue:GetTable
                  - glue:GetPartitions

  # DataBrew dataset whose input is the `movies` table of the
  # `movies-workshop-db` Glue Data Catalog database.
  DataBrewDataset:
    Type: AWS::DataBrew::Dataset
    Properties:
      Input:
        DataCatalogInputDefinition:
          TableName: movies
          DatabaseName: movies-workshop-db
      Name: movies-dataset

  # DataBrew project that ties together the dataset, the recipe and the
  # service role declared in this template.
  DataBrewProject:
    Type: AWS::DataBrew::Project
    Properties:
      # Static name: it contains no ${...} variables, so no !Sub is needed.
      Name: databrew-project
      DatasetName: !Ref DataBrewDataset
      RoleArn: !GetAtt DataBrewRole.Arn
      RecipeName: !Ref DataBrewRecipe
      # FIRST_N sample of size 500.
      Sample:
        Type: FIRST_N
        Size: 500

  # Recipe applied by the project. It runs the same four-step sequence twice,
  # first for `genres` and then for `spoken_languages`:
  # SPLIT_COLUMN_SINGLE_DELIMITER -> UNPIVOT -> DELETE -> REMOVE_MISSING.
  # All parameter values are passed as strings.
  DataBrewRecipe:
    Type: AWS::DataBrew::Recipe
    Properties:
      Name: workshop-recipe
      Description: Recipe for normalizing genres and spoken_languages
      Steps:
        # genres, step 1: split on "; " with limit 6. The UNPIVOT step that
        # follows consumes genres_1 .. genres_7.
        - Action:
            Operation: SPLIT_COLUMN_SINGLE_DELIMITER
            Parameters:
              sourceColumn: genres
              pattern: "; "
              limit: "6"
              isText: "TRUE"
              includeInSplit: "true"
        # genres, step 2: unpivot the split columns into genre_index / genre.
        - Action:
            Operation: UNPIVOT
            Parameters:
              sourceColumns: >-
                ["genres_1","genres_2","genres_3","genres_4","genres_5","genres_6","genres_7"]
              unpivotColumn: genre_index
              valueColumn: genre
        # genres, step 3: delete the helper column created by the unpivot.
        - Action:
            Operation: DELETE
            Parameters:
              sourceColumns: '["genre_index"]'
        # genres, step 4: REMOVE_MISSING on the genre column.
        - Action:
            Operation: REMOVE_MISSING
            Parameters:
              sourceColumn: genre
        # spoken_languages, step 1: split on "; " with limit 4. The UNPIVOT
        # step that follows consumes spoken_languages_1 .. spoken_languages_5.
        - Action:
            Operation: SPLIT_COLUMN_SINGLE_DELIMITER
            Parameters:
              sourceColumn: spoken_languages
              pattern: "; "
              limit: "4"
              isText: "TRUE"
              includeInSplit: "true"
        # spoken_languages, step 2: unpivot into lang_index / spoken_languages.
        - Action:
            Operation: UNPIVOT
            Parameters:
              sourceColumns: '["spoken_languages_1","spoken_languages_2","spoken_languages_3","spoken_languages_4","spoken_languages_5"]'
              unpivotColumn: lang_index
              valueColumn: spoken_languages
        # spoken_languages, step 3: delete the helper column.
        - Action:
            Operation: DELETE
            Parameters:
              sourceColumns: '["lang_index"]'
        # spoken_languages, step 4: REMOVE_MISSING on spoken_languages.
        - Action:
            Operation: REMOVE_MISSING
            Parameters:
              sourceColumn: spoken_languages

  # RECIPE-type DataBrew job bound to the project. It writes comma-delimited
  # CSV output under the clean/movies/ prefix of the workshop bucket.
  DataBrewJob:
    Type: AWS::DataBrew::Job
    # No explicit DependsOn: `!Ref DataBrewProject` and
    # `!GetAtt DataBrewRole.Arn` below already make CloudFormation create the
    # project and the role before this job.
    Properties:
      Name: movies-clean-job
      Type: RECIPE
      ProjectName: !Ref DataBrewProject
      RoleArn: !GetAtt DataBrewRole.Arn
      Outputs:
        - Location:
            Bucket: !Sub "movie-data-bucket-${AWS::AccountId}-${AWS::Region}"
            Key: "clean/movies/"
            # The bucket is declared as owned by the account running the stack.
            BucketOwner: !Ref AWS::AccountId
          Format: CSV
          FormatOptions:
            Csv:
              Delimiter: ","

# Values exposed by the stack: the project name, the dataset name and the
# ARN of the IAM role.
Outputs:
  ProjectName:
    Description: The name of the DataBrew project
    Value: !Ref DataBrewProject

  DatasetName:
    Description: The name of the DataBrew dataset
    Value: !Ref DataBrewDataset

  DataBrewRoleArn:
    Description: IAM Role used by DataBrew
    Value: !GetAtt DataBrewRole.Arn