diff --git a/ansible/group_vars/vagrant.yml b/ansible/group_vars/vagrant.yml index 1b36bd2..a27ca9d 100644 --- a/ansible/group_vars/vagrant.yml +++ b/ansible/group_vars/vagrant.yml @@ -15,3 +15,11 @@ kv_store: etcd etcd_client_port: 2379 docker_port: 2375 swarm_port: 3375 + +influxdb: + host: 192.168.44.11 + port: 8086 + +grafana: + host: 192.168.44.11 + port: 3000 diff --git a/ansible/inventory/vagrant b/ansible/inventory/vagrant index 7dc33e2..46ac1aa 100644 --- a/ansible/inventory/vagrant +++ b/ansible/inventory/vagrant @@ -12,3 +12,9 @@ node-[1:3] [vagrant:children] nodes + +[influxdb_server] +node-1 + +[grafana_server] +node-1 diff --git a/ansible/roles/grafana/defaults/main.yml b/ansible/roles/grafana/defaults/main.yml new file mode 100644 index 0000000..97a49a1 --- /dev/null +++ b/ansible/roles/grafana/defaults/main.yml @@ -0,0 +1,10 @@ +--- +# defaults file for grafana +grafana: + version: 3.0.1-1 + host: 192.168.44.11 + port: 3000 + admin: admin + admin_password: admin + conf_dir: /etc/grafana + root: /var/lib/grafana diff --git a/ansible/roles/grafana/files/grafana.repo b/ansible/roles/grafana/files/grafana.repo new file mode 100644 index 0000000..a2c2eb6 --- /dev/null +++ b/ansible/roles/grafana/files/grafana.repo @@ -0,0 +1,5 @@ +[grafana] +name=grafana +baseurl=https://packagecloud.io/grafana/stable/el/6/$basearch +enabled=1 +gpgcheck=0 diff --git a/ansible/roles/grafana/handlers/main.yml b/ansible/roles/grafana/handlers/main.yml new file mode 100644 index 0000000..b3a2b7d --- /dev/null +++ b/ansible/roles/grafana/handlers/main.yml @@ -0,0 +1,4 @@ +--- +# handlers file for grafana +- name: restart grafana + service: name=grafana-server state=restarted diff --git a/ansible/roles/grafana/meta/main.yml b/ansible/roles/grafana/meta/main.yml new file mode 100644 index 0000000..84fcbd5 --- /dev/null +++ b/ansible/roles/grafana/meta/main.yml @@ -0,0 +1,32 @@ +--- +galaxy_info: + author: Jishnu Vijayan + description: Monitoring solution for upswing + company: 
DigitalEarns (Infrastructure) + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + # Some suggested licenses: + # - BSD (default) + # - MIT + # - GPLv2 + # - GPLv3 + # - Apache + # - CC-BY + license: license (GPLv2, CC-BY, etc) + min_ansible_version: 1.2 + # + # Below are all platforms currently available. Just uncomment + # the ones that apply to your role. If you don't see your + # platform on this list, let us know and we'll get it added! + # + #platforms: + # + # Below are all categories currently available. Just as with + # the platforms above, uncomment those that apply to your role. + # + #categories: +dependencies: [] + # List your role dependencies here, one per line. + # Be sure to remove the '[]' above if you add dependencies + # to this list. diff --git a/ansible/roles/grafana/tasks/main.yml b/ansible/roles/grafana/tasks/main.yml new file mode 100644 index 0000000..81013f3 --- /dev/null +++ b/ansible/roles/grafana/tasks/main.yml @@ -0,0 +1,47 @@ +--- +- include_vars: ../../telegraf/defaults/main.yml + +- name: configure grafana repo + copy: src=grafana.repo dest=/etc/yum.repos.d/grafana.repo owner=root group=root + +- name: install grafana + yum: name=grafana-{{ grafana.version }} state=present + +- name: configure grafana + template: src=grafana.ini.j2 dest={{ grafana.conf_dir }}/grafana.ini + notify: + - restart grafana + +- meta: flush_handlers + +- name: start grafana-server + service: name=grafana-server enabled=yes state=started + +- name: install python-httplib2.noarch + yum: name=python-httplib2.noarch state=present + +- wait_for: port={{ grafana.port }} delay=10 timeout=20 + +- name: setup upswing datasource in grafana + uri: + url: http://{{ grafana.host }}:{{ grafana.port }}/api/datasources + method: POST + body: "{{ lookup('template','datasource.json') }}" + user: "{{ grafana.admin }}" + password: "{{ grafana.admin_password }}" + 
force_basic_auth: yes + body_format: json + HEADER_Content-Type: "application/json" + status_code: 200,500 + +- name: set up upswing dashboard + uri: + url: http://{{ grafana.host }}:{{ grafana.port }}/api/dashboards/db + method: POST + body: "{{ lookup('template','upswing_nodes.json') }}" + user: "{{ grafana.admin }}" + password: "{{ grafana.admin_password }}" + force_basic_auth: yes + body_format: json + HEADER_Content-Type: "application/json" + status_code: 200,412 diff --git a/ansible/roles/grafana/templates/datasource.json b/ansible/roles/grafana/templates/datasource.json new file mode 100644 index 0000000..0661fd8 --- /dev/null +++ b/ansible/roles/grafana/templates/datasource.json @@ -0,0 +1,14 @@ + +{ +"name":"upswing_telegraf", +"type":"influxdb", +"access":"direct", +"isDefault":true, +"url":"http://{{ influxdb.host }}:{{ influxdb.port }}", +"user":"{{ telegraf.db.admin }}", +"password":"{{ telegraf.db.admin_password }}", +"database":"{{ telegraf.db.name }}", +"basicAuth":false, +"basicAuthUser":"", +"basicAuthPassword":"" +} diff --git a/ansible/roles/grafana/templates/grafana.ini.j2 b/ansible/roles/grafana/templates/grafana.ini.j2 new file mode 100644 index 0000000..fd2c463 --- /dev/null +++ b/ansible/roles/grafana/templates/grafana.ini.j2 @@ -0,0 +1,267 @@ +##################### Grafana Configuration Example ##################### +# +# Everything has defaults so you only need to uncomment things you want to +# change + +# possible values : production, development +; app_mode = production + +#################################### Paths #################################### +[paths] +# Path to where grafana can store temp files, sessions, and the sqlite3 db (if that is used) +# +;data = {{ grafana.root }} +# +# Directory where grafana can store logs +# +;logs = /var/log/grafana +# +# Directory where grafana will automatically scan and look for plugins +# +;plugins = {{ grafana.root }}/plugins + +# +#################################### Server 
#################################### +[server] +# Protocol (http or https) +;protocol = http + +# The ip address to bind to, empty will bind to all interfaces +;http_addr = + +# The http port to use +;http_port = {{ grafana.port }} + +# The public facing domain name used to access grafana from a browser +;domain = localhost + +# Redirect to correct domain if host header does not match domain +# Prevents DNS rebinding attacks +;enforce_domain = false + +# The full public facing url +;root_url = %(protocol)s://%(domain)s:%(http_port)s/ + +# Log web requests +;router_logging = false + +# the path relative working path +;static_root_path = public + +# enable gzip +;enable_gzip = false + +# https certs & key file +;cert_file = +;cert_key = + +#################################### Database #################################### +[database] +# Either "mysql", "postgres" or "sqlite3", it's your choice +;type = sqlite3 +;host = 127.0.0.1:3306 +;name = grafana +;user = root +;password = + +# For "postgres" only, either "disable", "require" or "verify-full" +;ssl_mode = disable + +# For "sqlite3" only, path relative to data_path setting +;path = grafana.db + +#################################### Session #################################### +[session] +# Either "memory", "file", "redis", "mysql", "postgres", default is "file" +;provider = file + +# Provider config options +# memory: not have any config yet +# file: session dir path, is relative to grafana data_path +# redis: config like redis server e.g. `addr=127.0.0.1:6379,pool_size=100,db=grafana` +# mysql: go-sql-driver/mysql dsn config string, e.g. 
`user:password@tcp(127.0.0.1:3306)/database_name` +# postgres: user=a password=b host=localhost port=5432 dbname=c sslmode=disable +;provider_config = sessions + +# Session cookie name +;cookie_name = grafana_sess + +# If you use session in https only, default is false +;cookie_secure = false + +# Session life time, default is 86400 +;session_life_time = 86400 + +#################################### Analytics #################################### +[analytics] +# Server reporting, sends usage counters to stats.grafana.org every 24 hours. +# No ip addresses are being tracked, only simple counters to track +# running instances, dashboard and error counts. It is very helpful to us. +# Change this option to false to disable reporting. +;reporting_enabled = true + +# Set to false to disable all checks to https://grafana.net +# for new versions (grafana itself and plugins), check is used +# in some UI views to notify that grafana or plugin update exists +# This option does not cause any auto updates, nor send any information +# only a GET request to http://grafana.net to get latest versions +check_for_updates = true + +# Google Analytics universal tracking code, only enabled if you specify an id here +;google_analytics_ua_id = + +#################################### Security #################################### +[security] +# default admin user, created on startup +;admin_user = admin + +# default admin password, can be changed before first start of grafana, or in profile settings +;admin_password = admin + +# used for signing +;secret_key = SW2YcwTIb9zpOOhoPsMm + +# Auto-login remember days +;login_remember_days = 7 +;cookie_username = grafana_user +;cookie_remember_name = grafana_remember + +# disable gravatar profile images +;disable_gravatar = false + +# data source proxy whitelist (ip_or_domain:port separated by spaces) +;data_source_proxy_whitelist = + +[snapshots] +# snapshot sharing options +;external_enabled = true +;external_snapshot_url = 
https://snapshots-origin.raintank.io +;external_snapshot_name = Publish to snapshot.raintank.io + +#################################### Users #################################### +[users] +# disable user signup / registration +;allow_sign_up = true + +# Allow non admin users to create organizations +;allow_org_create = true + +# Set to true to automatically assign new users to the default organization (id 1) +;auto_assign_org = true + +# Default role new users will be automatically assigned (if disabled above is set to true) +;auto_assign_org_role = Viewer + +# Background text for the user field on the login page +;login_hint = email or username + +#################################### Anonymous Auth ########################## +[auth.anonymous] +# enable anonymous access +;enabled = false + +# specify organization name that should be used for unauthenticated users +;org_name = Main Org. + +# specify role for unauthenticated users +;org_role = Viewer + +#################################### Github Auth ########################## +[auth.github] +;enabled = false +;allow_sign_up = false +;client_id = some_id +;client_secret = some_secret +;scopes = user:email,read:org +;auth_url = https://github.com/login/oauth/authorize +;token_url = https://github.com/login/oauth/access_token +;api_url = https://api.github.com/user +;team_ids = +;allowed_organizations = + +#################################### Google Auth ########################## +[auth.google] +;enabled = false +;allow_sign_up = false +;client_id = some_client_id +;client_secret = some_client_secret +;scopes = https://www.googleapis.com/auth/userinfo.profile https://www.googleapis.com/auth/userinfo.email +;auth_url = https://accounts.google.com/o/oauth2/auth +;token_url = https://accounts.google.com/o/oauth2/token +;api_url = https://www.googleapis.com/oauth2/v1/userinfo +;allowed_domains = + +#################################### Auth Proxy ########################## +[auth.proxy] +;enabled = false +;header_name = 
X-WEBAUTH-USER +;header_property = username +;auto_sign_up = true + +#################################### Basic Auth ########################## +[auth.basic] +;enabled = true + +#################################### Auth LDAP ########################## +[auth.ldap] +;enabled = false +;config_file = /etc/grafana/ldap.toml + +#################################### SMTP / Emailing ########################## +[smtp] +;enabled = false +;host = localhost:25 +;user = +;password = +;cert_file = +;key_file = +;skip_verify = false +;from_address = admin@grafana.localhost + +[emails] +;welcome_email_on_sign_up = false + +#################################### Logging ########################## +[log] +# Either "console", "file", "syslog". Default is console and file +# Use comma to separate multiple modes, e.g. "console, file" +;mode = console, file + +# Buffer length of channel, keep it as it is if you don't know what it is. +;buffer_len = 10000 + +# Either "Trace", "Debug", "Info", "Warn", "Error", "Critical", default is "Info" +;level = Info + +# For "console" mode only +[log.console] +;level = + +# For "file" mode only +[log.file] +;level = +# This enables automated log rotate(switch of following options), default is true +;log_rotate = true + +# Max line number of single file, default is 1000000 +;max_lines = 1000000 + +# Max size shift of single file, default is 28 means 1 << 28, 256MB +;max_lines_shift = 28 + +# Segment log daily, default is true +;daily_rotate = true + +# Expired days of log file(delete after max days), default is 7 +;max_days = 7 + +#################################### AMQP Event Publisher ########################## +[event_publisher] +;enabled = false +;rabbitmq_url = amqp://localhost/ +;exchange = grafana_events + +;#################################### Dashboard JSON files ########################## +[dashboards.json] +;enabled = false +;path = {{ grafana.root }}/dashboards diff --git a/ansible/roles/grafana/templates/upswing_nodes.json 
b/ansible/roles/grafana/templates/upswing_nodes.json new file mode 100644 index 0000000..dec8d2b --- /dev/null +++ b/ansible/roles/grafana/templates/upswing_nodes.json @@ -0,0 +1,557 @@ +{ + +"overwrite": true, + "dashboard": + { + "id": null, + "title": "Upswing", + "originalTitle": "Upswing", + "tags": [], + "style": "dark", + "timezone": "browser", + "editable": true, + "hideControls": false, + "sharedCrosshair": false, + "rows": [ + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": { + "cpu.system - node-1": "#1F78C1" + }, + "bars": false, + "datasource": null, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "cpu.system - $tag_host", + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "host" + ], + "type": "tag" + } + ], + "measurement": "cpu", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "usage_system" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$upswing_hosts$/" + } + ] + }, + { + "alias": "cpu.user - $tag_host", + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "host" + ], + "type": "tag" + } 
+ ], + "measurement": "cpu", + "policy": "default", + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "usage_user" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$upswing_hosts$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Panel Title", + "tooltip": { + "msResolution": true, + "shared": true, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + "cpu.system - node-1": "#1F78C1" + }, + "bars": false, + "datasource": null, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 4, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "mem.available - $tag_host", + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "auto" + ], + "type": "time" + }, + { + "params": [ + "host" + ], + "type": "tag" + } + ], + "hide": false, + "measurement": "mem", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "available_percent" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + 
"tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$upswing_hosts$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Panel Title", + "tooltip": { + "msResolution": true, + "shared": true, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + "cpu.system - node-1": "#1F78C1" + }, + "bars": false, + "datasource": null, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 5, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "disk.reads - $tag_host", + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "auto" + ], + "type": "time" + }, + { + "params": [ + "host" + ], + "type": "tag" + } + ], + "hide": false, + "measurement": "diskio", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "reads" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$upswing_hosts$/" + } + ], + "query": "SELECT mean(\"reads\") FROM \"diskio\" WHERE \"host\" =~ /^$upswing_hosts$/ AND $timeFilter GROUP BY time($interval), \"host\"", + "rawQuery": 
true + }, + { + "alias": "disk.writes - $tag_host", + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "auto" + ], + "type": "time" + }, + { + "params": [ + "host" + ], + "type": "tag" + } + ], + "hide": false, + "measurement": "diskio", + "policy": "default", + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "reads" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$upswing_hosts$/" + } + ], + "query": "SELECT mean(\"writes\") FROM \"diskio\" WHERE \"host\" =~ /^$upswing_hosts$/ AND $timeFilter GROUP BY time($interval), \"host\"", + "rawQuery": true + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Panel Title", + "tooltip": { + "msResolution": true, + "shared": true, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "title": "Row" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [], + "title": "New row" + } + ], + "time": { + "from": "now/d", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "templating": { + "list": [ + { + "current": { + "value": [ + "$__all" + ], + "text": "All", + "tags": [] + }, + "datasource": null, + "hide": 0, + "includeAll": true, + "label": "upswing_hosts", + "multi": true, + "name": "upswing_hosts", + "options": [ + { + "text": "All", + "value": "$__all", + "selected": true + }, + { + "text": "node-1", + "value": "node-1", + "selected": false + }, + { + "text": "node-2", + 
"value": "node-2", + "selected": false + }, + { + "text": "node-3", + "value": "node-3", + "selected": false + }, + { + "text": "node-4", + "value": "node-4", + "selected": false + } + ], + "query": "SHOW TAG VALUES FROM \"cpu\" WITH KEY = \"host\"", + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "annotations": { + "list": [] + }, + "refresh": "5s", + "schemaVersion": 12, + "version": 9, + "links": [] + } +} diff --git a/ansible/roles/grafana/vars/main.yml b/ansible/roles/grafana/vars/main.yml new file mode 100644 index 0000000..cc6e520 --- /dev/null +++ b/ansible/roles/grafana/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for grafana diff --git a/ansible/roles/influxdb/defaults/main.yml b/ansible/roles/influxdb/defaults/main.yml new file mode 100644 index 0000000..f0b953c --- /dev/null +++ b/ansible/roles/influxdb/defaults/main.yml @@ -0,0 +1,8 @@ +--- +# defaults file for influxdb +influxdb: + version: 0.13.0-1 + host: 192.168.44.11 + port: 8086 + root: /var/lib/influxdb + conf_dir: /etc/influxdb diff --git a/ansible/roles/influxdb/files/influxdb.repo b/ansible/roles/influxdb/files/influxdb.repo new file mode 100644 index 0000000..85e4dae --- /dev/null +++ b/ansible/roles/influxdb/files/influxdb.repo @@ -0,0 +1,6 @@ +[influxdb] +name = InfluxDB Repository - RHEL $releasever +baseurl = https://repos.influxdata.com/rhel/$releasever/$basearch/stable +enabled = 1 +gpgcheck = 1 +gpgkey = https://repos.influxdata.com/influxdb.key diff --git a/ansible/roles/influxdb/handlers/main.yml b/ansible/roles/influxdb/handlers/main.yml new file mode 100644 index 0000000..b793055 --- /dev/null +++ b/ansible/roles/influxdb/handlers/main.yml @@ -0,0 +1,4 @@ +--- +# handlers file for influxdb +- name: restart influxdb + service: name=influxdb state=restarted diff --git a/ansible/roles/influxdb/meta/main.yml b/ansible/roles/influxdb/meta/main.yml new file mode 100644 index 0000000..84fcbd5 --- /dev/null +++ b/ansible/roles/influxdb/meta/main.yml @@ -0,0 +1,32 @@ 
+--- +galaxy_info: + author: Jishnu Vijayan + description: Monitoring solution for upswing + company: DigitalEarns (Infrastructure) + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + # Some suggested licenses: + # - BSD (default) + # - MIT + # - GPLv2 + # - GPLv3 + # - Apache + # - CC-BY + license: license (GPLv2, CC-BY, etc) + min_ansible_version: 1.2 + # + # Below are all platforms currently available. Just uncomment + # the ones that apply to your role. If you don't see your + # platform on this list, let us know and we'll get it added! + # + #platforms: + # + # Below are all categories currently available. Just as with + # the platforms above, uncomment those that apply to your role. + # + #categories: +dependencies: [] + # List your role dependencies here, one per line. + # Be sure to remove the '[]' above if you add dependencies + # to this list. diff --git a/ansible/roles/influxdb/tasks/main.yml b/ansible/roles/influxdb/tasks/main.yml new file mode 100644 index 0000000..87df82c --- /dev/null +++ b/ansible/roles/influxdb/tasks/main.yml @@ -0,0 +1,14 @@ +--- +- name: configure influxdb repo + copy: src=influxdb.repo dest=/etc/yum.repos.d/influxdb.repo owner=root group=root + +- name: install influxdb + yum: name=influxdb-{{ influxdb.version }} state=present + +- name: start influxdb + service: name=influxdb enabled=yes state=started + +- name: edit influxdb conf + template: src=influxdb.conf.j2 dest={{ influxdb.conf_dir }}/influxdb.conf + notify: + - restart influxdb diff --git a/ansible/roles/influxdb/templates/influxdb.conf.j2 b/ansible/roles/influxdb/templates/influxdb.conf.j2 new file mode 100644 index 0000000..8ce44a1 --- /dev/null +++ b/ansible/roles/influxdb/templates/influxdb.conf.j2 @@ -0,0 +1,216 @@ + +# manually set the hostname +# hostname = "localhost" + +### +### [meta] +### +### Controls the parameters for the Raft consensus group that 
stores metadata +### about the InfluxDB cluster. +### + +[meta] + # Where the metadata/raft database is stored + dir = "{{ influxdb.root }}/meta" + + retention-autocreate = true + + # If log messages are printed for the meta service + logging-enabled = true + pprof-enabled = false + + # The default duration for leases. + lease-duration = "1m0s" + +### +### [data] +### +### Controls where the actual shard data for InfluxDB lives and how it is +### flushed from the WAL. "dir" may need to be changed to a suitable place +### for your system, but the WAL settings are an advanced configuration. The +### defaults should work for most systems. +### + +[data] + # Controls if this node holds time series data shards in the cluster + enabled = true + + dir = "{{ influxdb.root }}/data" + + # These are the WAL settings for the storage engine >= 0.9.3 + wal-dir = "{{ influxdb.root }}/wal" + wal-logging-enabled = true + data-logging-enabled = true + + # Whether queries should be logged before execution. Very useful for troubleshooting, but will + # log any sensitive data contained within a query. + # query-log-enabled = true + + # Settings for the TSM engine + + # CacheMaxMemorySize is the maximum size a shard's cache can + # reach before it starts rejecting writes. 
+ # cache-max-memory-size = 524288000 + + # CacheSnapshotMemorySize is the size at which the engine will + # snapshot the cache and write it to a TSM file, freeing up memory + # cache-snapshot-memory-size = 26214400 + + # CacheSnapshotWriteColdDuration is the length of time at + # which the engine will snapshot the cache and write it to + # a new TSM file if the shard hasn't received writes or deletes + # cache-snapshot-write-cold-duration = "1h" + + # MinCompactionFileCount is the minimum number of TSM files + # that need to exist before a compaction cycle will run + # compact-min-file-count = 3 + + # CompactFullWriteColdDuration is the duration at which the engine + # will compact all TSM files in a shard if it hasn't received a + # write or delete + # compact-full-write-cold-duration = "24h" + + # MaxPointsPerBlock is the maximum number of points in an encoded + # block in a TSM file. Larger numbers may yield better compression + # but could incur a performance penalty when querying + # max-points-per-block = 1000 + +### +### [cluster] +### +### Controls non-Raft cluster behavior, which generally includes how data is +### shared across shards. +### + +[cluster] + shard-writer-timeout = "5s" # The time within which a remote shard must respond to a write request. + write-timeout = "10s" # The time within which a write request must complete on the cluster. + max-concurrent-queries = 0 # The maximum number of concurrent queries that can run. 0 to disable. + query-timeout = "0s" # The time within which a query must complete before being killed automatically. 0s to disable. + max-select-point = 0 # The maximum number of points to scan in a query. 0 to disable. + max-select-series = 0 # The maximum number of series to select in a query. 0 to disable. + max-select-buckets = 0 # The maximum number of buckets to select in an aggregate query. 0 to disable. + +### +### [retention] +### +### Controls the enforcement of retention policies for evicting old data. 
+### + +[retention] + enabled = true + check-interval = "30m" + +### +### [shard-precreation] +### +### Controls the precreation of shards, so they are available before data arrives. +### Only shards that, after creation, will have both a start- and end-time in the +### future, will ever be created. Shards are never precreated that would be wholly +### or partially in the past. + +[shard-precreation] + enabled = true + check-interval = "10m" + advance-period = "30m" + +### +### Controls the system self-monitoring, statistics and diagnostics. +### +### The internal database for monitoring data is created automatically if +### it does not already exist. The target retention within this database +### is called 'monitor' and is also created with a retention period of 7 days +### and a replication factor of 1, if it does not exist. In all cases +### this retention policy is configured as the default for the database. + +[monitor] + store-enabled = true # Whether to record statistics internally. + store-database = "_internal" # The destination database for recorded statistics + store-interval = "5s" # The interval at which to record statistics + +### +### [admin] +### +### Controls the availability of the built-in, web-based admin interface. If HTTPS is +### enabled for the admin interface, HTTPS must also be enabled on the [http] service. +### + +[admin] + enabled = true + bind-address = ":8083" + https-enabled = false + https-certificate = "/etc/ssl/influxdb.pem" + +### +### [http] +### +### Controls how the HTTP endpoints are configured. These are the primary +### mechanism for getting data into and out of InfluxDB. +### + +[http] + enabled = true + bind-address = ":8086" + auth-enabled = false + log-enabled = true + write-tracing = false + pprof-enabled = false + https-enabled = false + https-certificate = "/etc/ssl/influxdb.pem" + max-row-limit = 10000 + +### +### [collectd] +### +### Controls one or many listeners for collectd data. 
+### + +[[collectd]] + enabled = false + # bind-address = "" + # database = "" + # typesdb = "" + + # These next lines control how batching works. You should have this enabled + # otherwise you could get dropped metrics or poor performance. Batching + # will buffer points in memory if you have many coming in. + + # batch-size = 1000 # will flush if this many points get buffered + # batch-pending = 5 # number of batches that may be pending in memory + # batch-timeout = "1s" # will flush at least this often even if we haven't hit buffer limit + # read-buffer = 0 # UDP Read buffer size, 0 means OS default. UDP listener will fail if set above OS max. + +### +### [[udp]] +### +### Controls the listeners for InfluxDB line protocol data via UDP. +### + +[[udp]] + enabled = false + # bind-address = "" + # database = "udp" + # retention-policy = "" + + # These next lines control how batching works. You should have this enabled + # otherwise you could get dropped metrics or poor performance. Batching + # will buffer points in memory if you have many coming in. + + # batch-size = 1000 # will flush if this many points get buffered + # batch-pending = 5 # number of batches that may be pending in memory + # batch-timeout = "1s" # will flush at least this often even if we haven't hit buffer limit + # read-buffer = 0 # UDP Read buffer size, 0 means OS default. UDP listener will fail if set above OS max. + + # set the expected UDP payload size; lower values tend to yield better performance, default is max UDP size 65536 + # udp-payload-size = 65536 + +### +### [continuous_queries] +### +### Controls how continuous queries are run within InfluxDB. 
+### + +[continuous_queries] + log-enabled = true + enabled = true + # run-interval = "1s" # interval for how often continuous queries will be checked if they need to run diff --git a/ansible/roles/influxdb/vars/main.yml b/ansible/roles/influxdb/vars/main.yml new file mode 100644 index 0000000..753f126 --- /dev/null +++ b/ansible/roles/influxdb/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for influxdb diff --git a/ansible/roles/telegraf/defaults/main.yml b/ansible/roles/telegraf/defaults/main.yml new file mode 100644 index 0000000..ac1fb22 --- /dev/null +++ b/ansible/roles/telegraf/defaults/main.yml @@ -0,0 +1,9 @@ +--- +# defaults file for telegraf +telegraf: + version: 0.13.1-1 + conf_dir: /etc/telegraf + db: + name: upswing + admin: admin + admin_password: admin123 diff --git a/ansible/roles/telegraf/handlers/main.yml b/ansible/roles/telegraf/handlers/main.yml new file mode 100644 index 0000000..e8bd5e9 --- /dev/null +++ b/ansible/roles/telegraf/handlers/main.yml @@ -0,0 +1,4 @@ +--- + +- name: restart telegraf + service: name=telegraf state=restarted diff --git a/ansible/roles/telegraf/meta/main.yml b/ansible/roles/telegraf/meta/main.yml new file mode 100644 index 0000000..84fcbd5 --- /dev/null +++ b/ansible/roles/telegraf/meta/main.yml @@ -0,0 +1,32 @@ +--- +galaxy_info: + author: Jishnu Vijayan + description: Monitoring solution for upswing + company: DigitalEarns (Infrastructure) + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + # Some suggested licenses: + # - BSD (default) + # - MIT + # - GPLv2 + # - GPLv3 + # - Apache + # - CC-BY + license: license (GPLv2, CC-BY, etc) + min_ansible_version: 1.2 + # + # Below are all platforms currently available. Just uncomment + # the ones that apply to your role. If you don't see your + # platform on this list, let us know and we'll get it added! 
+ # + #platforms: + # + # Below are all categories currently available. Just as with + # the platforms above, uncomment those that apply to your role. + # + #categories: +dependencies: [] + # List your role dependencies here, one per line. + # Be sure to remove the '[]' above if you add dependencies + # to this list. diff --git a/ansible/roles/telegraf/tasks/main.yml b/ansible/roles/telegraf/tasks/main.yml new file mode 100644 index 0000000..bcbacb2 --- /dev/null +++ b/ansible/roles/telegraf/tasks/main.yml @@ -0,0 +1,15 @@ +--- + +- name: configure telegraf repo + copy: src=roles/influxdb/files/influxdb.repo dest=/etc/yum.repos.d/influxdb.repo owner=root group=root + +- name: install telegraf + yum: name=telegraf-{{ telegraf.version }} state=present + +- name: start telegraf deamon + service: name=telegraf enabled=yes state=started + +- name: configure telegraf to report to influxdb + template: src=telegraf.conf.j2 dest={{ telegraf.conf_dir }}/telegraf.conf + notify: + - restart telegraf diff --git a/ansible/roles/telegraf/templates/telegraf.conf.j2 b/ansible/roles/telegraf/templates/telegraf.conf.j2 new file mode 100644 index 0000000..71b0706 --- /dev/null +++ b/ansible/roles/telegraf/templates/telegraf.conf.j2 @@ -0,0 +1,176 @@ +# Telegraf Configuration + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "10s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at + ## most metric_batch_size metrics. + metric_batch_size = 1000 + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. 
+ ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "10s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## Run telegraf in debug mode + debug = false + ## Run telegraf in quiet mode + quiet = false + ## Override default hostname, if empty use os.Hostname() + hostname = "" + ## If set to true, do not set the "host" tag in the telegraf agent. + omit_hostname = false + + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +# Configuration for influxdb server to send metrics to +[[outputs.influxdb]] + urls = ["http://{{ influxdb.host }}:{{ influxdb.port }}"] # required + ## The target database for metrics (telegraf will create it if it does not exist). + database = "{{ telegraf.db.name }}" # required + ## Precision of writes, valid values are "ns", "us" (or "µs"), "ms", "s", "m", "h". + ## note: using "s" precision greatly improves InfluxDB compression. + precision = "s" + + ## Retention policy to write to. + retention_policy = "default" + ## Write consistency (clusters only), can be: "any", "one", "quorum", "all" + write_consistency = "any" + + ## Write timeout (for the InfluxDB client), formatted as a string. + ## If not provided, will default to 5s. 0s means no timeout (not recommended).
+ timeout = "5s" + username = "{{ telegraf.db.admin }}" + password = "{{ telegraf.db.admin_password }}" + +############################################################################### +# INPUT PLUGINS # +############################################################################### + +# Read metrics about cpu usage +[[inputs.cpu]] + ## Whether to report per-cpu stats or not + percpu = true + ## Whether to report total system cpu stats or not + totalcpu = true + ## Comment this line if you want the raw CPU time metrics + fielddrop = ["time_*"] + + +# Read metrics about disk usage by mount point +[[inputs.disk]] + ## By default, telegraf gathers stats for all mountpoints. + ## Setting mountpoints will restrict the stats to the specified mountpoints. + # mount_points = ["/"] + + ## Ignore some mountpoints by filesystem type. For example (dev)tmpfs (usually + ## present on /run, /var/run, /dev/shm or /dev). + ignore_fs = ["tmpfs", "devtmpfs"] + + +# Read metrics about disk IO by device +[[inputs.diskio]] + ## By default, telegraf will gather stats for all devices including + ## disk partitions. + ## Setting devices will restrict the stats to the specified devices. + # devices = ["sda", "sdb"] + ## Uncomment the following line if you do not need disk serial numbers.
+ # skip_serial_number = true + +# Get kernel statistics from /proc/stat +[[inputs.kernel]] + # no configuration + +# Read metrics about memory usage +[[inputs.mem]] + # no configuration + +# Get the number of processes and group them by status +[[inputs.processes]] + # no configuration + +# Read metrics about swap memory usage +[[inputs.swap]] + # no configuration + +# Read metrics about system load & uptime +[[inputs.system]] + # no configuration + +# # Read metrics about docker containers +# [[inputs.docker]] +# ## Docker Endpoint +# ## To use TCP, set endpoint = "tcp://[ip]:[port]" +# ## To use environment variables (ie, docker-machine), set endpoint = "ENV" +# endpoint = "unix:///var/run/docker.sock" +# ## Only collect metrics for these containers, collect all if empty +# container_names = [] +# ## Timeout for docker list, info, and stats commands +# timeout = "5s" + +# # Read metrics from one or more commands that can output to stdout +# [[inputs.exec]] +# ## Commands array +# commands = ["/tmp/test.sh", "/usr/bin/mycollector --foo=bar"] +# +# ## Timeout for each command to complete. +# timeout = "5s" +# +# ## measurement name suffix (for separating different commands) +# name_suffix = "_mycollector" +# +# ## Data format to consume. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + +# ## Optional SSL Config +# # ssl_ca = "/etc/telegraf/ca.pem" +# # ssl_cert = "/etc/telegraf/cert.pem" +# # ssl_key = "/etc/telegraf/key.pem" +# ## Use SSL but skip chain & host verification +# # insecure_skip_verify = false + +# # Collect kernel snmp counters and network interface statistics +# [[inputs.nstat]] +# ## file paths for proc files. If empty default paths will be used: +# ## /proc/net/netstat, /proc/net/snmp, /proc/net/snmp6 +# ## These can also be overridden with env variables, see README.
+# proc_net_netstat = "" +# proc_net_snmp = "" +# proc_net_snmp6 = "" +# ## dump metrics with 0 values too +# dump_zeros = true + +# ## urls to ping +# urls = ["www.google.com"] # required +# ## number of pings to send per collection (ping -c ) +# count = 1 # required +# ## interval, in s, at which to ping. 0 == default (ping -i ) +# ping_interval = 0.0 +# ## ping timeout, in s. 0 == no timeout (ping -W ) +# timeout = 1.0 +# ## interface to send ping from (ping -I ) +# interface = "" diff --git a/ansible/roles/telegraf/vars/main.yml b/ansible/roles/telegraf/vars/main.yml new file mode 100644 index 0000000..e668808 --- /dev/null +++ b/ansible/roles/telegraf/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for telegraf diff --git a/ansible/site.yml b/ansible/site.yml index 2863dc8..4487968 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -55,3 +55,21 @@ - swarm_agent tags: - swarm + +- hosts: influxdb_server + roles: + - influxdb + tags: + - influxdb + +- hosts: all + roles: + - telegraf + tags: + - telegraf + +- hosts: grafana_server + roles: + - grafana + tags: + - grafana