diff --git a/.gitignore b/.gitignore index acc3b58..500b37c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,12 +1,25 @@ -# Ignore local settings -localsettings.py +# Python binaries +*.pyc + +# Sphinx +docs/_build +docs/_build_html + +# OSX garbage +.DS_STORE -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] +# Scrapy Cluster +kafka-monitor/logs/* +redis-monitor/logs/* +crawler/logs/* +crawler/main.log +localsettings.py -# C extensions -*.so +# Vagrant test VM +.vagrant +local/ +bin/ +pip-selfcheck.json # Distribution / packaging .Python @@ -24,38 +37,4 @@ sdist/ var/ *.egg-info/ .installed.cfg -*.egg - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*,cover - -# Translations -*.mo -*.pot - -# Django stuff: -*.log - -# Sphinx documentation -docs/_build/ -docs/_build_html/ - -# PyBuilder -target/ +*.egg \ No newline at end of file diff --git a/Vagrantfile b/Vagrantfile new file mode 100644 index 0000000..76de45e --- /dev/null +++ b/Vagrantfile @@ -0,0 +1,29 @@ +# -*- mode: ruby -*- +# vi: set ft=ruby : + +Vagrant.require_version ">= 1.7.4" + +Vagrant.configure(2) do |config| + + # Configure general VM options + config.vm.provider "virtualbox" do |vb| + vb.memory = 2048 + vb.cpus = 4 + end + + config.vm.define 'scdev' do |node| + node.vm.box = 'ubuntu/trusty64' + node.vm.hostname = 'scdev' + node.vm.network "private_network", ip: "192.168.33.99" + node.vm.provision "ansible" do |ansible| + ansible.verbose = true + ansible.groups = { + "kafka" => ["scdev"], + "zookeeper" => ["scdev"], + "redis" => ["scdev"], + "all_groups:children" => ["kafka", "zookeeper", "redis"] + } + ansible.playbook = "ansible/scrapy-cluster.yml" + end + end +end diff 
--git a/ansible/kafka.yml b/ansible/kafka.yml new file mode 100644 index 0000000..199d8de --- /dev/null +++ b/ansible/kafka.yml @@ -0,0 +1,12 @@ +--- + +- name: Kafka Brokers + hosts: kafka + + sudo: yes + + vars: + - kafka_host_list: "{{ groups['kafka'] }}" + - zookeeper_host_list: "{{ groups['zookeeper'] }}" + roles: + - kafka diff --git a/ansible/redis.yml b/ansible/redis.yml new file mode 100644 index 0000000..8a5f92a --- /dev/null +++ b/ansible/redis.yml @@ -0,0 +1,9 @@ +--- + +- name: Redis Master + hosts: redis + + sudo: yes + + roles: + - redis \ No newline at end of file diff --git a/ansible/roles/java/defaults/main.yml b/ansible/roles/java/defaults/main.yml new file mode 100644 index 0000000..eb2dbb0 --- /dev/null +++ b/ansible/roles/java/defaults/main.yml @@ -0,0 +1,6 @@ +--- +# file: roles/java/defaults/main.yml + +# The specific version of Oracle Java that can be found in YUM +java_version: 1.7.0_71 + diff --git a/ansible/roles/java/files/java.sh b/ansible/roles/java/files/java.sh new file mode 100644 index 0000000..1009258 --- /dev/null +++ b/ansible/roles/java/files/java.sh @@ -0,0 +1,3 @@ +# Initialization script for Java +JAVA_HOME="/usr/java/default" +export JAVA_HOME diff --git a/ansible/roles/java/tasks/main.yml b/ansible/roles/java/tasks/main.yml new file mode 100644 index 0000000..0ea7e1c --- /dev/null +++ b/ansible/roles/java/tasks/main.yml @@ -0,0 +1,54 @@ +--- +# file: roles/java/tasks/main.yml + +- name: apt install java + apt: + name=default-jdk + state=present + # update-cache=yes + tags: java + when: ansible_os_family == "Debian" + +- name: yum install java + yum: + name=jdk-{{ java_version }} + state=present + tags: java + when: ansible_os_family == "RedHat" + +- name: java system environment configuration + copy: + src=java.sh + dest=/etc/profile.d/java.sh + owner=0 + group=0 + mode=0755 + tags: java + +- name: Set JAVA_HOME ansible fact + set_fact: + java_home=/usr/java/default + tags: java + +- name: Create Ansible facts.d 
directory + file: + state=directory + dest=/etc/ansible/facts.d + owner=0 + group=0 + mode=0755 + tags: java + +- name: Install java facts + template: + src=facts.j2 + dest=/etc/ansible/facts.d/java.fact + owner=0 + group=0 + mode=0644 + tags: java + +- name: Re-read facts + setup: + filter=ansible_local + tags: java diff --git a/ansible/roles/java/templates/facts.j2 b/ansible/roles/java/templates/facts.j2 new file mode 100644 index 0000000..9414592 --- /dev/null +++ b/ansible/roles/java/templates/facts.j2 @@ -0,0 +1,2 @@ +[general] +java_home={{ java_home }} diff --git a/ansible/roles/kafka/defaults/main.yml b/ansible/roles/kafka/defaults/main.yml new file mode 100644 index 0000000..0512ab8 --- /dev/null +++ b/ansible/roles/kafka/defaults/main.yml @@ -0,0 +1,20 @@ +--- + +kafka_version: 0.8.2.2 + +kafka_install_dir: /opt/kafka +kafka_config_dir: /opt/kafka/default/config +kafka_log_dir: /opt/kafka/default/logs +kafka_data_log_dir: + - /opt/kafka/topic-logs + +kafka_port: 9092 +kafka_message_max: 10000000 +kafka_replica_fetch_max_bytes: 15000000 +kafka_consumer_message_max: 16777216 +kafka_num_partitions: "{{ groups['kafka'] | length }}" +kafka_replication_factor: "{{ groups['kafka'] | length }}" +kafka_log_retention_hours: 168 +kafka_num_io_threads: 8 + +kafka_source: "http://www.carfab.com/apachesoftware/kafka" \ No newline at end of file diff --git a/ansible/roles/kafka/handlers/main.yml b/ansible/roles/kafka/handlers/main.yml new file mode 100644 index 0000000..ff2d508 --- /dev/null +++ b/ansible/roles/kafka/handlers/main.yml @@ -0,0 +1,5 @@ +--- +- name: restart kafka + supervisorctl: + name=kafka + state=restarted diff --git a/ansible/roles/kafka/meta/main.yml b/ansible/roles/kafka/meta/main.yml new file mode 100644 index 0000000..f5b7113 --- /dev/null +++ b/ansible/roles/kafka/meta/main.yml @@ -0,0 +1,4 @@ +--- +dependencies: + - { role: supervisord } + - { role: java } diff --git a/ansible/roles/kafka/tasks/main.yml b/ansible/roles/kafka/tasks/main.yml new 
file mode 100644 index 0000000..c00f877 --- /dev/null +++ b/ansible/roles/kafka/tasks/main.yml @@ -0,0 +1,92 @@ +--- + +- name: create kafka directories + file: + path={{ item }} + state=directory + mode=0744 + with_items: + - "{{ kafka_install_dir }}" + - "{{ kafka_data_log_dir }}" + tags: kafka + +- name: check for existing install + stat: path={{ kafka_install_dir }}/kafka_2.9.2-{{ kafka_version }} + register: kafka + tags: kafka + +- name: download kafka + get_url: + url="{{ kafka_source }}/{{ kafka_version }}/kafka_2.9.2-{{ kafka_version }}.tgz" + dest=/tmp/kafka_2.9.2-{{ kafka_version }}.tgz + mode=0644 + validate_certs=no + when: kafka.stat.isdir is not defined + tags: kafka + +- name: extract kafka + unarchive: + src=/tmp/kafka_2.9.2-{{ kafka_version }}.tgz + dest={{ kafka_install_dir }} + copy=no + when: kafka.stat.isdir is not defined + tags: kafka + +- name: delete temporary kafka file + file: + path=/tmp/kafka_2.9.2-{{ kafka_version }}.tgz + state=absent + ignore_errors: yes + tags: kafka + +- name: create kafka symlink + file: + path={{ kafka_install_dir }}/default + state=link + src={{ kafka_install_dir }}/kafka_2.9.2-{{ kafka_version }} + tags: kafka + +- name: configure kafka brokers + template: + src=server.properties.j2 + dest={{ kafka_config_dir }}/server.properties + mode=0644 + notify: + - restart kafka + tags: kafka + +- name: configure log4j + template: + src=log4j.properties.j2 + dest={{ kafka_config_dir }}/log4j.properties + mode=0644 + notify: + - restart kafka + tags: kafka + +- name: configure kafka consumer + template: + src=consumer.properties.j2 + dest={{ kafka_config_dir }}/consumer.properties + mode=0644 + notify: + - restart kafka + tags: kafka + +- name: copy supervisord config + template: + src=kafka-supervisord.conf.j2 + dest={{ supervisord_programs_dir }}/kafka-supervisord.conf + mode=0644 + notify: + - reread supervisord + tags: kafka + +- name: set up aliases + lineinfile: dest="/root/.bashrc" line="{{ item }}" + with_items: + - "export KAFKA={{ kafka_install_dir }}/default" + - "export PATH={{ kafka_install_dir }}/default/bin:$PATH" + tags: env + +- cron: name="clear old kafka app logs" job="find /opt/kafka/default/logs -mtime +7 -exec rm {} \; > /dev/null" minute="0" 
diff --git a/ansible/roles/kafka/templates/consumer.properties.j2 b/ansible/roles/kafka/templates/consumer.properties.j2 new file mode 100644 index 0000000..10b7bf4 --- /dev/null +++ b/ansible/roles/kafka/templates/consumer.properties.j2 @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# see kafka.consumer.ConsumerConfig for more details + +# Zookeeper connection string +# comma separated host:port pairs, each corresponding to a zk +# server. e.g. 
"127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002" +zookeeper.connect=127.0.0.1:2181 + +# timeout in ms for connecting to zookeeper +zookeeper.connection.timeout.ms=6000 + +#consumer group id +group.id=test-consumer-group + +#consumer timeout +#consumer.timeout.ms=5000 + +# Need to increase this to play nice with message.max.bytes = 10000000 +fetch.message.max.bytes={{ kafka_consumer_message_max }} diff --git a/ansible/roles/kafka/templates/kafka-supervisord.conf.j2 b/ansible/roles/kafka/templates/kafka-supervisord.conf.j2 new file mode 100644 index 0000000..7121e12 --- /dev/null +++ b/ansible/roles/kafka/templates/kafka-supervisord.conf.j2 @@ -0,0 +1,6 @@ +[program:kafka] +command={{ kafka_install_dir }}/default/bin/kafka-server-start.sh {{ kafka_config_dir }}/server.properties +autostart=true +autorestart=true +startsecs=5 +stopsignal=KILL diff --git a/ansible/roles/kafka/templates/log4j.properties.j2 b/ansible/roles/kafka/templates/log4j.properties.j2 new file mode 100644 index 0000000..c0dbd1b --- /dev/null +++ b/ansible/roles/kafka/templates/log4j.properties.j2 @@ -0,0 +1,82 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +kafka.logs.dir={{ kafka_log_dir }} + +log4j.rootLogger=INFO, stdout + +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout +log4j.appender.stdout.layout.ConversionPattern=[%d] %p %m (%c)%n + +log4j.appender.kafkaAppender=org.apache.log4j.RollingFileAppender +#log4j.appender.kafkaAppender.DatePattern='.'yyyy-MM-dd-HH +log4j.appender.kafkaAppender.File=${kafka.logs.dir}/server.log +log4j.appender.kafkaAppender.layout=org.apache.log4j.PatternLayout +log4j.appender.kafkaAppender.layout.ConversionPattern=[%d] %p %m (%c)%n +log4j.appender.kafkaAppender.MaxFileSize=100MB +log4j.appender.kafkaAppender.MaxBackupIndex=5 +log4j.appender.kafkaAppender.RollingStyle=1 +#log4j.appender.kafkaAppender.MaxSizeRollBackups=5 + +log4j.appender.stateChangeAppender=org.apache.log4j.DailyRollingFileAppender +log4j.appender.stateChangeAppender.DatePattern='.'yyyy-MM-dd-HH +log4j.appender.stateChangeAppender.File=${kafka.logs.dir}/state-change.log +log4j.appender.stateChangeAppender.layout=org.apache.log4j.PatternLayout +log4j.appender.stateChangeAppender.layout.ConversionPattern=[%d] %p %m (%c)%n + +log4j.appender.requestAppender=org.apache.log4j.DailyRollingFileAppender +log4j.appender.requestAppender.DatePattern='.'yyyy-MM-dd-HH +log4j.appender.requestAppender.File=${kafka.logs.dir}/kafka-request.log +log4j.appender.requestAppender.layout=org.apache.log4j.PatternLayout +log4j.appender.requestAppender.layout.ConversionPattern=[%d] %p %m (%c)%n + +log4j.appender.cleanerAppender=org.apache.log4j.DailyRollingFileAppender +log4j.appender.cleanerAppender.DatePattern='.'yyyy-MM-dd-HH +log4j.appender.cleanerAppender.File=log-cleaner.log +log4j.appender.cleanerAppender.layout=org.apache.log4j.PatternLayout +log4j.appender.cleanerAppender.layout.ConversionPattern=[%d] %p %m (%c)%n + +log4j.appender.controllerAppender=org.apache.log4j.DailyRollingFileAppender +#log4j.appender.controllerAppender.DatePattern='.'yyyy-MM-dd-HH 
+log4j.appender.controllerAppender.File=${kafka.logs.dir}/controller.log +log4j.appender.controllerAppender.layout=org.apache.log4j.PatternLayout +log4j.appender.controllerAppender.layout.ConversionPattern=[%d] %p %m (%c)%n + +# Turn on all our debugging info +#log4j.logger.kafka.producer.async.DefaultEventHandler=DEBUG, kafkaAppender +#log4j.logger.kafka.client.ClientUtils=DEBUG, kafkaAppender +#log4j.logger.kafka.perf=DEBUG, kafkaAppender +#log4j.logger.kafka.perf.ProducerPerformance$ProducerThread=DEBUG, kafkaAppender +#log4j.logger.org.I0Itec.zkclient.ZkClient=DEBUG +log4j.logger.kafka=INFO, kafkaAppender + +log4j.logger.kafka.network.RequestChannel$=WARN, requestAppender +log4j.additivity.kafka.network.RequestChannel$=false + +#log4j.logger.kafka.network.Processor=TRACE, requestAppender +#log4j.logger.kafka.server.KafkaApis=TRACE, requestAppender +#log4j.additivity.kafka.server.KafkaApis=false +log4j.logger.kafka.request.logger=WARN, requestAppender +log4j.additivity.kafka.request.logger=false + +log4j.logger.kafka.controller=TRACE, controllerAppender +log4j.additivity.kafka.controller=false + +log4j.logger.kafka.log.LogCleaner=INFO, cleanerAppender +log4j.additivity.kafka.log.LogCleaner=false + +log4j.logger.state.change.logger=TRACE, stateChangeAppender +log4j.additivity.state.change.logger=false diff --git a/ansible/roles/kafka/templates/server.properties.j2 b/ansible/roles/kafka/templates/server.properties.j2 new file mode 100644 index 0000000..7a88611 --- /dev/null +++ b/ansible/roles/kafka/templates/server.properties.j2 @@ -0,0 +1,133 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# see kafka.server.KafkaConfig for additional details and defaults + +############################# Server Basics ############################# + +# The id of the broker. This must be set to a unique integer for each broker. +{% for host in kafka_host_list %} +{%- if host == inventory_hostname -%}broker.id={{ loop.index }}{%- endif -%} +{% endfor %} + +# The maximum message size the broker can receive (10mb as of 8/28/14) +message.max.bytes={{ kafka_message_max }} +replica.fetch.max.bytes={{ kafka_replica_fetch_max_bytes }} + +# Can delete topics in Kafka 0.8.2.0 +delete.topic.enable=true + +############################# Socket Server Settings ############################# + +# The port the socket server listens on +port={{ kafka_port }} + +# Hostname the broker will bind to. If not set, the server will bind to all interfaces +# host.name={{ inventory_hostname }} + +# Hostname the broker will advertise to producers and consumers. If not set, it uses the +# value for "host.name" if configured. Otherwise, it will use the value returned from +# java.net.InetAddress.getCanonicalHostName(). +# advertised.host.name={{ inventory_hostname }} + +# The port to publish to ZooKeeper for clients to use. If this is not set, +# it will publish the same port that the broker binds to. 
+advertised.port={{ kafka_port }} + +# The number of threads handling network requests +# Apache Docs recommend setting this to 8 in production +num.network.threads=8 + +# The number of threads doing disk I/O +num.io.threads={{ kafka_num_io_threads }} + +# The send buffer (SO_SNDBUF) used by the socket server +socket.send.buffer.bytes=1048576 + +# The receive buffer (SO_RCVBUF) used by the socket server +socket.receive.buffer.bytes=1048576 + +# The maximum size of a request that the socket server will accept (protection against OOM) +socket.request.max.bytes=104857600 + + +############################# Log Basics ############################# + +# A comma separated list of directories under which to store log files +log.dirs={% for dir in kafka_data_log_dir %}{{dir}},{% endfor %} + +# The default number of log partitions per topic. More partitions allow greater +# parallelism for consumption, but this will also result in more files across +# the brokers. +num.partitions={{ kafka_num_partitions }} + +# Add redundancy across brokers +default.replication.factor={{ kafka_replication_factor }} + +# Increase IO between replication brokers +num.replica.fetchers=4 + +############################# Log Flush Policy ############################# + +# Messages are immediately written to the filesystem but by default we only fsync() to sync +# the OS cache lazily. The following configurations control the flush of data to disk. +# There are a few important trade-offs here: +# 1. Durability: Unflushed data may be lost if you are not using replication. +# 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush. +# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks. +# The settings below allow one to configure the flush policy to flush data after a period of time or +# every N messages (or both). 
This can be done globally and overridden on a per-topic basis. + +# The number of messages to accept before forcing a flush of data to disk +#log.flush.interval.messages=10000 + +# The maximum amount of time a message can sit in a log before we force a flush +#log.flush.interval.ms=1000 + +############################# Log Retention Policy ############################# + +# The following configurations control the disposal of log segments. The policy can +# be set to delete segments after a period of time, or after a given size has accumulated. +# A segment will be deleted whenever *either* of these criteria are met. Deletion always happens +# from the end of the log. + +# The minimum age of a log file to be eligible for deletion +log.retention.hours={{ kafka_log_retention_hours }} + +# A size-based retention policy for logs. Segments are pruned from the log as long as the remaining +# segments don't drop below log.retention.bytes. +#log.retention.bytes=1073741824 + +# The maximum size of a log segment file. When this size is reached a new log segment will be created. +log.segment.bytes=536870912 + +# The interval at which log segments are checked to see if they can be deleted according +# to the retention policies +log.retention.check.interval.ms=60000 + +# By default the log cleaner is disabled and the log retention policy will default to just delete segments after their retention expires. +# If log.cleaner.enable=true is set the cleaner will be enabled and individual logs can then be marked for log compaction. +log.cleaner.enable=false + +############################# Zookeeper ############################# + +# Zookeeper connection string (see zookeeper docs for details). +# This is a comma separated host:port pairs, each corresponding to a zk +# server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002". +# You can also append an optional chroot string to the urls to specify the +# root directory for all kafka znodes. 
+zookeeper.connect={% for host in zookeeper_host_list %}{{ host }}:{{ zookeeper_client_port|default(2181) }}{% if not loop.last %},{% endif %}{% endfor %} + +# Timeout in ms for connecting to zookeeper +zookeeper.connection.timeout.ms=1000000 diff --git a/ansible/roles/miniconda/defaults/main.yml b/ansible/roles/miniconda/defaults/main.yml new file mode 100644 index 0000000..c1ed527 --- /dev/null +++ b/ansible/roles/miniconda/defaults/main.yml @@ -0,0 +1,6 @@ +--- + +miniconda_install_dir: /opt/miniconda +user_home: /root + +miniconda_source: "https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh" \ No newline at end of file diff --git a/ansible/roles/miniconda/tasks/main.yml b/ansible/roles/miniconda/tasks/main.yml new file mode 100644 index 0000000..63cbce6 --- /dev/null +++ b/ansible/roles/miniconda/tasks/main.yml @@ -0,0 +1,45 @@ +--- +- name: Check if Miniconda Environment is set up already + stat: path={{ miniconda_install_dir }} + register: minicondaCheck + tags: miniconda + +- name: Download miniconda + get_url: + url="{{ miniconda_source }}" + dest=/tmp/Miniconda.sh + mode=0755 + validate_certs=no + when: minicondaCheck.stat.exists == false + tags: miniconda + +- name: Setup Miniconda + when: minicondaCheck.stat.exists == false + command: "/tmp/Miniconda.sh -b -p {{ miniconda_install_dir }}" + tags: miniconda + +- name: Install miniconda pip + command: "/opt/miniconda/bin/conda install pip --yes" + tags: + - miniconda + - pip + +- name: prepend miniconda to path for root + lineinfile: + dest=/root/.bashrc + insertafter=EOF + line={{item}} + state=present + with_items: + - PATH={{ miniconda_install_dir }}/bin:$PATH + tags: miniconda + +- name: prepend miniconda to path for vagrant + lineinfile: + dest=/home/vagrant/.bashrc + insertafter=EOF + line={{item}} + state=present + with_items: + - PATH={{ miniconda_install_dir }}/bin:$PATH + tags: miniconda \ No newline at end of file diff --git a/ansible/roles/pip/tasks/main.yml 
b/ansible/roles/pip/tasks/main.yml new file mode 100644 index 0000000..0da6cce --- /dev/null +++ b/ansible/roles/pip/tasks/main.yml @@ -0,0 +1,17 @@ +--- +# file: roles/pip/tasks/main.yml + +- name: yum install python-pip + yum: + name=python-pip + state=present + tags: pip + when: ansible_os_family == "RedHat" + +- name: apt install python-pip + apt: + name=python-pip + state=present + # update-cache=yes + tags: pip + when: ansible_os_family == "Debian" \ No newline at end of file diff --git a/ansible/roles/redis/LICENSE b/ansible/roles/redis/LICENSE new file mode 100644 index 0000000..21bd84c --- /dev/null +++ b/ansible/roles/redis/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2014 David Wittman + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/ansible/roles/redis/README.md b/ansible/roles/redis/README.md new file mode 100644 index 0000000..17837cf --- /dev/null +++ b/ansible/roles/redis/README.md @@ -0,0 +1,249 @@ +# Dec. 
3, 2014 +# Original scripts modified by andrew.carter@soteradefense.com to remove Debian support, and be managed by supervisord. + + +# ansible-redis + +[![Build Status](https://travis-ci.org/DavidWittman/ansible-redis.svg?branch=master)](https://travis-ci.org/DavidWittman/ansible-redis) + + - Requires Ansible 1.6.3+ + - Compatible with most versions of Ubuntu/Debian and RHEL/CentOS 6.x + +## Installation + +``` bash +$ ansible-galaxy install DavidWittman.redis +``` + +## Getting started + +### Single Redis node + +Deploying a single Redis server node is pretty trivial; just add the role to your playbook and go. Here's an example which we'll make a little more exciting by setting the bind address to 127.0.0.1: + +``` yml +--- +- hosts: redis01.example.com + vars: + - redis_bind: 127.0.0.1 + roles: + - redis +``` + +``` bash +$ ansible-playbook -i redis01.example.com, redis.yml +``` + +**Note:** You may have noticed above that I just passed a hostname in as the Ansible inventory file. This is an easy way to run Ansible without first having to create an inventory file, you just need to suffix the hostname with a comma so Ansible knows what to do with it. + +That's it! You'll now have a Redis server listening on 127.0.0.1 on redis01.example.com. By default, the Redis binaries are installed under /opt/redis, though this can be overridden by setting the `redis_install_dir` variable. + +### Master-Slave replication + +Configuring [replication](http://redis.io/topics/replication) in Redis is accomplished by deploying multiple nodes, and setting the `redis_slaveof` variable on the slave nodes, just as you would in the redis.conf. In the example that follows, we'll deploy a Redis master with three slaves. + +In this example, we're going to use groups to separate the master and slave nodes. 
Let's start with the inventory file: + +``` ini +[redis-master] +redis-master.example.com + +[redis-slave] +redis-slave0[1:3].example.com +``` + +And here's the playbook: + +``` yml +--- +- name: configure the master redis server + hosts: redis-master + roles: + - redis + +- name: configure redis slaves + hosts: redis-slave + vars: + - redis_slaveof: redis-master.example.com 6379 + roles: + - redis +``` + +In this case, I'm assuming you have DNS records set up for redis-master.example.com, but that's not always the case. You can pretty much go crazy with whatever you need this to be set to. In many cases, I tell Ansible to use the eth1 IP address for the master. Here's a more flexible value for the sake of posterity: + +``` yml +redis_slaveof: "{{ hostvars['redis-master.example.com'].ansible_eth1.ipv4.address }} {{ redis_port }}" +``` + +Now you're cooking with gas! Running this playbook should have you ready to go with a Redis master and three slaves. + +### Redis Sentinel + +#### Introduction + +Using Master-Slave replication is great for durability and distributing reads and writes, but not so much for high availability. If the master node fails, a slave must be manually promoted to master, and connections will need to be redirected to the new master. The solution for this problem is [Redis Sentinel](http://redis.io/topics/sentinel), a distributed system which uses Redis itself to communicate and handle automatic failover in a Redis cluster. + +Sentinel itself uses the same redis-server binary that Redis uses, but runs with the `--sentinel` flag and with a different configuration file. All of this, of course, is abstracted with this Ansible role, but it's still good to know. + +#### Configuration + +To add a Sentinel node to an existing deployment, assign this same `redis` role to it, and set the variable `redis_sentinel` to True on that particular host. 
This can be done in any number of ways, and for the purposes of this example I'll extend on the inventory file used above in the Master/Slave configuration: + +``` ini +[redis-master] +redis-master.example.com + +[redis-slave] +redis-slave0[1:3].example.com + +[redis-sentinel] +redis-sentinel0[1:3].example.com redis_sentinel=True +``` + +Above, we've added three more hosts in the **redis-sentinel** group (though this group serves no purpose within the role, it's merely an identifier), and set the `redis_sentinel` variable inline within the inventory file. + +Now, all we need to do is set the `redis_sentinel_monitors` variable to define the Redis masters which Sentinel should monitor. In this case, I'm going to do this within the playbook: + +``` yml +- name: configure the master redis server + hosts: redis-master + roles: + - redis + +- name: configure redis slaves + hosts: redis-slave + vars: + - redis_slaveof: redis-master.example.com 6379 + roles: + - redis + +- name: configure redis sentinel nodes + hosts: redis-sentinel + vars: + - redis_sentinel_monitors: + - name: master01 + host: redis-master.example.com + port: 6379 + roles: + - redis +``` + +This will configure the Sentinel nodes to monitor the master we created above using the identifier `master01`. By default, Sentinel will use a quorum of 2, which means that at least 2 Sentinels must agree that a master is down in order for a failover to take place. This value can be overridden by setting the `quorum` key within your monitor definition. See the [Sentinel docs](http://redis.io/topics/sentinel) for more details. + +Along with the variables listed above, Sentinel has a number of its own configurables just as Redis server does. These are prefixed with `redis_sentinel_`, and are enumerated in the **Configurables** section below. + + +## Installing redis from a source file in the ansible role + +If the environment your server resides in does not allow downloads (i.e. 
if the machine is sitting in a dmz) set the variable `redis_tarball` to the path of a locally downloaded tar.gz file to prevent a http download from redis.io. +Do not forget to set the version variable to the same version of the tar.gz. to avoid confusion ! + +For example (file was stored in same folder as the playbook that included the redis role): +```yml +vars: + - redis_version: 2.8.14 + - redis_tarball: redis-2.8.14.tar.gz +``` +In this case the source archive is copied towards the server over ssh rather than downloaded. + + + +## Configurables + +Here is a list of all the default variables for this role, which are also available in defaults/main.yml. One of these days I'll format these into a table or something. + +``` yml +--- +## Installation options +redis_version: 2.8.8 +redis_install_dir: /opt/redis +redis_user: redis +# Working directory for Redis. RDB and AOF files will be written here. +redis_dir: /var/lib/redis/{{ redis_port }} +redis_tarball: false +# The open file limit for Redis/Sentinel +redis_nofile_limit: 16384 +# Configure Redis as a service +# When set to false, this role will not create init scripts or manage +# the Redis/Sentinel processes. +# This is usually needed when a tool like Supervisor will manage the process. +redis_as_service: true + +## Networking/connection options +redis_bind: 0.0.0.0 +redis_port: 6379 +redis_password: false +redis_tcp_backlog: 511 +redis_tcp_keepalive: 0 +# Max connected clients at a time +redis_maxclients: 10000 +redis_timeout: 0 + +## Replication options +# Set slaveof just as you would in redis.conf. (e.g. "redis01 6379") +redis_slaveof: false +# Make slaves read-only. "yes" or "no" +redis_slave_read_only: "yes" +redis_slave_priority: 100 +redis_repl_backlog_size: false + +## Logging +redis_logfile: '""' +# Enable syslog. "yes" or "no" +redis_syslog_enabled: "yes" +redis_syslog_ident: redis_{{ redis_port }} +# Syslog facility. 
Must be USER or LOCAL0-LOCAL7 +redis_syslog_facility: USER + +## General configuration +redis_daemonize: "yes" +redis_pidfile: /var/run/redis/{{ redis_port }}.pid +# Number of databases to allow +redis_databases: 16 +redis_loglevel: notice +# Log queries slower than this many milliseconds. -1 to disable +redis_slowlog_log_slower_than: 10000 +# Maximum number of slow queries to save +redis_slowlog_max_len: 128 +# Redis memory limit (e.g. 4294967296, 4096mb, 4gb) +redis_maxmemory: false +redis_maxmemory_policy: noeviction +redis_rename_commands: [] +# How frequently to snapshot the database to disk +# e.g. "900 1" => 900 seconds if at least 1 key changed +redis_save: + - 900 1 + - 300 10 + - 60 10000 + +## Redis sentinel configs +# Set this to true on a host to configure it as a Sentinel +redis_sentinel: false +redis_sentinel_dir: /var/lib/redis/sentinel_{{ redis_sentinel_port }} +redis_sentinel_bind: 0.0.0.0 +redis_sentinel_port: 26379 +redis_sentinel_pidfile: /var/run/redis/sentinel_{{ redis_sentinel_port }}.pid +redis_sentinel_logfile: '""' +redis_sentinel_syslog_ident: sentinel_{{ redis_sentinel_port }} +redis_sentinel_monitors: + - name: master01 + host: localhost + port: 6379 + quorum: 2 + auth_pass: ant1r3z + down_after_milliseconds: 30000 + parallel_syncs: 1 + failover_timeout: 180000 + notification_script: false + client_reconfig_script: false +``` + +## Facts + +The following facts are accessible in your inventory or tasks outside of this role. 
+ +- `{{ ansible_local.redis.bind }}` +- `{{ ansible_local.redis.port }}` +- `{{ ansible_local.redis.sentinel_bind }}` +- `{{ ansible_local.redis.sentinel_port }}` +- `{{ ansible_local.redis.sentinel_monitors }}` diff --git a/ansible/roles/redis/defaults/main.yml b/ansible/roles/redis/defaults/main.yml new file mode 100644 index 0000000..052f61a --- /dev/null +++ b/ansible/roles/redis/defaults/main.yml @@ -0,0 +1,92 @@ +--- +## Installation options +redis_version: 3.0.5 +redis_install_dir: /opt/redis +redis_user: redis +redis_dir: /var/lib/redis/{{ redis_port }} +# The open file limit for Redis/Sentinel +redis_nofile_limit: 16384 +# Configure Redis as a service +# This creates the init scripts for Redis and ensures the process is running +# Also applies for Redis Sentinel +redis_as_service: true + +## Networking/connection options +redis_bind: 0.0.0.0 +redis_port: 6379 +redis_password: false +redis_tcp_backlog: 511 +redis_tcp_keepalive: 0 +# Max connected clients at a time +redis_maxclients: 10000 +redis_timeout: 0 +# Socket options +# Set socket_path to the desired path to the socket. E.g. /var/run/redis/{{ redis_port }}.sock +redis_socket_path: false +redis_socket_perm: 755 + +## Replication options +# Set slaveof just as you would in redis.conf. (e.g. "redis01 6379") +redis_slaveof: false +# Make slaves read-only. "yes" or "no" +redis_slave_read_only: "yes" +redis_slave_priority: 100 +redis_repl_backlog_size: false + +## Logging +redis_logfile: '""' +# Enable syslog. "yes" or "no" +redis_syslog_enabled: "yes" +redis_syslog_ident: redis_{{ redis_port }} +# Syslog facility. Must be USER or LOCAL0-LOCAL7 +redis_syslog_facility: USER + +## General configuration +redis_daemonize: "no" +redis_pidfile: /var/run/redis/{{ redis_port }}.pid +# Number of databases to allow +redis_databases: 16 +redis_loglevel: notice +# Log queries slower than this many milliseconds. 
-1 to disable +redis_slowlog_log_slower_than: 10000 +# Maximum number of slow queries to save +redis_slowlog_max_len: 128 +# Redis memory limit (e.g. 4294967296, 4096mb, 4gb) +redis_maxmemory: false +redis_maxmemory_policy: noeviction +redis_rename_commands: [] +# How frequently to snapshot the database to disk +# e.g. "900 1" => 900 seconds if at least 1 key changed +redis_save: + - 900 1 + - 300 10 + - 60 10000 +redis_appendonly: "no" +redis_appendfilename: "appendonly.aof" +redis_appendfsync: "everysec" +redis_no_appendfsync_on_rewrite: "no" +redis_auto_aof_rewrite_percentage: "100" +redis_auto_aof_rewrite_min_size: "64mb" + +## Redis sentinel configs +# Set this to true on a host to configure it as a Sentinel +redis_sentinel: false +redis_sentinel_dir: /var/lib/redis/sentinel_{{ redis_sentinel_port }} +redis_sentinel_bind: 0.0.0.0 +redis_sentinel_port: 26379 +redis_sentinel_pidfile: /var/run/redis/sentinel_{{ redis_sentinel_port }}.pid +redis_sentinel_logfile: '""' +redis_sentinel_syslog_ident: sentinel_{{ redis_sentinel_port }} +redis_sentinel_monitors: + - name: master01 + host: localhost + port: 6379 + quorum: 2 + auth_pass: ant1r3z + down_after_milliseconds: 30000 + parallel_syncs: 1 + failover_timeout: 180000 + notification_script: false + client_reconfig_script: false + +redis_source: "http://download.redis.io/releases" diff --git a/ansible/roles/redis/handlers/main.yml b/ansible/roles/redis/handlers/main.yml new file mode 100644 index 0000000..f7dfc4c --- /dev/null +++ b/ansible/roles/redis/handlers/main.yml @@ -0,0 +1,20 @@ +--- +- name: restart redis + supervisorctl: + name=redis + state=restarted + +- name: wait for redis port + wait_for: + port={{ redis_port }} + state=started + +- name: restart sentinel + supervisorctl: + name=redis-sentinel + state=restarted + +- name: wait for sentinel port + wait_for: + port={{ redis_sentinel_port }} + state=started diff --git a/ansible/roles/redis/meta/main.yml b/ansible/roles/redis/meta/main.yml new file mode 
100644 index 0000000..7b84a49 --- /dev/null +++ b/ansible/roles/redis/meta/main.yml @@ -0,0 +1,3 @@ +--- +dependencies: + - { role: supervisord } diff --git a/ansible/roles/redis/tasks/install.yml b/ansible/roles/redis/tasks/install.yml new file mode 100644 index 0000000..0cf94e7 --- /dev/null +++ b/ansible/roles/redis/tasks/install.yml @@ -0,0 +1,76 @@ +--- +- name: install dependencies + yum: + name={{ item }} + state=present + with_items: + - gcc + - make + tags: redis + when: ansible_os_family == "RedHat" + +- name: create redis directories + file: + path={{ item }} + state=directory + with_items: + - "{{ redis_install_dir }}" + - /etc/redis + - /var/log/redis/ + tags: redis + +- name: check for existing install + stat: path={{ redis_install_dir }}/redis-{{ redis_version }} + register: redis + tags: redis + +- name: add redis user + user: + name={{ redis_user }} + comment="Redis" + home={{ redis_install_dir }} + shell=/bin/false + system=yes + tags: redis + +- name: download redis + get_url: + url={{ redis_source }}/redis-{{ redis_version }}.tar.gz + dest=/usr/local/src/redis-{{ redis_version }}.tar.gz + validate_certs=no + when: redis.stat.isdir is not defined + tags: redis + +- name: extract redis tarball + shell: tar xf /usr/local/src/redis-{{ redis_version }}.tar.gz -C /usr/local/src + creates=/usr/local/src/redis-{{ redis_version }} + when: redis.stat.isdir is not defined + tags: redis + +- name: compile redis + command: make -j5 + chdir=/usr/local/src/redis-{{ redis_version }} + creates=/usr/local/src/redis-{{ redis_version }}/src/redis-server + when: redis.stat.isdir is not defined + tags: redis + +- name: create /var/run/redis + file: + path=/var/run/redis + state=directory + owner={{ redis_user }} + tags: redis + +- name: install redis + command: make PREFIX={{ redis_install_dir }}/redis-{{ redis_version }} install clean + chdir=/usr/local/src/redis-{{ redis_version }} + creates={{ redis_install_dir }}/redis-{{ redis_version }}/bin/redis-server + when: 
redis.stat.isdir is not defined + tags: redis + +- name: create redis symlink + file: + path={{ redis_install_dir }}/default + state=link + src={{ redis_install_dir }}/redis-{{ redis_version }} + tags: redis diff --git a/ansible/roles/redis/tasks/local_facts.yml b/ansible/roles/redis/tasks/local_facts.yml new file mode 100644 index 0000000..4879027 --- /dev/null +++ b/ansible/roles/redis/tasks/local_facts.yml @@ -0,0 +1,5 @@ +- name: create facts directory + file: path='/etc/ansible/facts.d' state='directory' + +- name: create redis facts + template: src='etc/ansible/facts.d/redis.fact.j2' dest='/etc/ansible/facts.d/redis.fact' \ No newline at end of file diff --git a/ansible/roles/redis/tasks/main.yml b/ansible/roles/redis/tasks/main.yml new file mode 100644 index 0000000..af0c12c --- /dev/null +++ b/ansible/roles/redis/tasks/main.yml @@ -0,0 +1,11 @@ +--- +- include: install.yml +- include: server.yml + when: not redis_sentinel + tags: + - config +- include: sentinel.yml + when: redis_sentinel + tags: + - config +- include: local_facts.yml \ No newline at end of file diff --git a/ansible/roles/redis/tasks/sentinel.yml b/ansible/roles/redis/tasks/sentinel.yml new file mode 100644 index 0000000..70da5fa --- /dev/null +++ b/ansible/roles/redis/tasks/sentinel.yml @@ -0,0 +1,43 @@ +--- +- name: create sentinel working directory + file: + path={{ redis_sentinel_dir }} + state=directory + recurse=yes + owner={{ redis_user }} + tags: redis + +- name: check if sentinel log file exists + stat: + path={{ redis_sentinel_logfile }} + register: sentinel_logfile_stat + tags: redis + +- name: ensure that sentinel log file exists and is writable by redis + file: + path={{ redis_sentinel_logfile }} + owner={{ redis_user }} + group={{ redis_user }} + mode=0600 + state=touch + when: sentinel_logfile_stat.stat.exists == False and redis_sentinel_logfile != '""' + tags: redis + +- name: create sentinel config file + template: + src=redis-sentinel.conf.j2 + dest=/etc/redis/sentinel.conf 
+ owner={{ redis_user }} + notify: + - restart sentinel + tags: redis + +- name: copy supervisord config + template: + src=redis-sentinel-supervisord.conf.j2 + dest={{ supervisord_programs_dir }}/redis-sentinel-supervisord.conf + mode=0644 + notify: + - reread supervisord + tags: redis + diff --git a/ansible/roles/redis/tasks/server.yml b/ansible/roles/redis/tasks/server.yml new file mode 100644 index 0000000..43ef451 --- /dev/null +++ b/ansible/roles/redis/tasks/server.yml @@ -0,0 +1,42 @@ +--- +- name: create redis working directory + file: + path={{ redis_dir }} + state=directory + recurse=yes + owner={{ redis_user }} + tags: redis + +- name: check if log file exists + stat: + path={{ redis_logfile }} + register: logfile_stat + tags: redis + +- name: ensure that log file exists and is writable by redis + file: + path={{ redis_logfile }} + owner={{ redis_user }} + group={{ redis_user }} + mode=0600 + state=touch + when: logfile_stat.stat.exists == False and redis_logfile != '""' + tags: redis + +- name: create redis config file + template: + src=redis.conf.j2 + dest=/etc/redis/redis.conf + owner={{ redis_user }} + notify: + - restart redis + tags: redis + +- name: copy supervisord config + template: + src=redis-supervisord.conf.j2 + dest={{ supervisord_programs_dir }}/redis-supervisord.conf + mode=0644 + notify: + - reread supervisord + tags: redis diff --git a/ansible/roles/redis/templates/etc/ansible/facts.d/redis.fact.j2 b/ansible/roles/redis/templates/etc/ansible/facts.d/redis.fact.j2 new file mode 100644 index 0000000..a22aa17 --- /dev/null +++ b/ansible/roles/redis/templates/etc/ansible/facts.d/redis.fact.j2 @@ -0,0 +1,7 @@ +{ + "bind": "{{ redis_bind }}", + "port": "{{ redis_port }}", + "sentinel_bind": "{{ redis_sentinel_bind }}", + "sentinel_port": "{{ redis_sentinel_port }}", + "sentinel_monitors": {{ redis_sentinel_monitors | to_json }} +} \ No newline at end of file diff --git a/ansible/roles/redis/templates/redis-sentinel-supervisord.conf.j2 
b/ansible/roles/redis/templates/redis-sentinel-supervisord.conf.j2 new file mode 100644 index 0000000..54de8fd --- /dev/null +++ b/ansible/roles/redis/templates/redis-sentinel-supervisord.conf.j2 @@ -0,0 +1,7 @@ +[program:redis-sentinel] +command={{ redis_install_dir }}/default/bin/redis-server /etc/redis/sentinel.conf --sentinel +autostart=true +autorestart=true +user={{ redis_user }} +stdout_logfile=/var/log/redis/stdout.log +stderr_logfile=/var/log/redis/stderr.log diff --git a/ansible/roles/redis/templates/redis-sentinel.conf.j2 b/ansible/roles/redis/templates/redis-sentinel.conf.j2 new file mode 100644 index 0000000..624e5b1 --- /dev/null +++ b/ansible/roles/redis/templates/redis-sentinel.conf.j2 @@ -0,0 +1,23 @@ +# redis-sentinel {{ redis_version }} configuration file +# sentinel_{{ redis_sentinel_port }}.conf + +daemonize {{ redis_daemonize }} +dir {{ redis_sentinel_dir }} +pidfile {{ redis_sentinel_pidfile }} +port {{ redis_sentinel_port }} +bind {{ redis_sentinel_bind }} + +{% for master in redis_sentinel_monitors -%} +sentinel monitor {{ master.name }} {{ master.host }} {{ master.port }} {{ master.quorum|d('2') }} +{% for option in ('auth_pass', 'down_after_milliseconds', 'parallel_syncs', 'failover_timeout', 'notification_script', 'client_reconfig_script') -%} +{% if master[option] is defined and master[option] -%} +sentinel {{ option|replace('_', '-') }} {{ master.name }} {{ master[option] }} +{% endif %} +{% endfor -%} + +{% endfor -%} + +logfile {{ redis_sentinel_logfile }} +syslog-enabled {{ redis_syslog_enabled }} +syslog-ident {{ redis_sentinel_syslog_ident }} +syslog-facility {{ redis_syslog_facility }} diff --git a/ansible/roles/redis/templates/redis-supervisord.conf.j2 b/ansible/roles/redis/templates/redis-supervisord.conf.j2 new file mode 100644 index 0000000..d5db921 --- /dev/null +++ b/ansible/roles/redis/templates/redis-supervisord.conf.j2 @@ -0,0 +1,7 @@ +[program:redis] +command={{ redis_install_dir }}/default/bin/redis-server 
/etc/redis/redis.conf +autostart=true +autorestart=true +user={{ redis_user }} +stdout_logfile=/var/log/redis/stdout.log +stderr_logfile=/var/log/redis/stderr.log diff --git a/ansible/roles/redis/templates/redis.conf.j2 b/ansible/roles/redis/templates/redis.conf.j2 new file mode 100644 index 0000000..6307e8d --- /dev/null +++ b/ansible/roles/redis/templates/redis.conf.j2 @@ -0,0 +1,786 @@ +# Redis {{ redis_version }} configuration file +# {{ redis_port }}.conf + +# Note on units: when memory size is needed, it is possible to specify +# it in the usual form of 1k 5GB 4M and so forth: +# +# 1k => 1000 bytes +# 1kb => 1024 bytes +# 1m => 1000000 bytes +# 1mb => 1024*1024 bytes +# 1g => 1000000000 bytes +# 1gb => 1024*1024*1024 bytes +# +# units are case insensitive so 1GB 1Gb 1gB are all the same. + +################################## INCLUDES ################################### + +# Include one or more other config files here. This is useful if you +# have a standard template that goes to all Redis server but also need +# to customize a few per-server settings. Include files can include +# other files, so use this wisely. +# +# Notice option "include" won't be rewritten by command "CONFIG REWRITE" +# from admin or Redis Sentinel. Since Redis always uses the last processed +# line as value of a configuration directive, you'd better put includes +# at the beginning of this file to avoid overwriting config change at runtime. +# +# If instead you are interested in using includes to override configuration +# options, it is better to use include as the last line. +# +# include /path/to/local.conf +# include /path/to/other.conf + +################################ GENERAL ##################################### + +# By default Redis does not run as a daemon. Use 'yes' if you need it. +# Note that Redis will write a pid file in /var/run/redis.pid when daemonized. 
+daemonize {{ redis_daemonize }} + +# When running daemonized, Redis writes a pid file in /var/run/redis.pid by +# default. You can specify a custom pid file location here. +pidfile {{ redis_pidfile }} + +# Accept connections on the specified port, default is 6379. +# If port 0 is specified Redis will not listen on a TCP socket. +port {{ redis_port }} + +# TCP listen() backlog. +# +# In high requests-per-second environments you need an high backlog in order +# to avoid slow clients connections issues. Note that the Linux kernel +# will silently truncate it to the value of /proc/sys/net/core/somaxconn so +# make sure to raise both the value of somaxconn and tcp_max_syn_backlog +# in order to get the desired effect. +tcp-backlog {{ redis_tcp_backlog }} + +# By default Redis listens for connections from all the network interfaces +# available on the server. It is possible to listen to just one or multiple +# interfaces using the "bind" configuration directive, followed by one or +# more IP addresses. +# +# Examples: +# +# bind 192.168.1.100 10.0.0.1 +# bind 127.0.0.1 +bind {{ redis_bind }} + +# Specify the path for the Unix socket that will be used to listen for +# incoming connections. There is no default, so Redis will not listen +# on a unix socket when not specified. +# +{% if redis_socket_path -%} +unixsocket {{ redis_socket_path }} +unixsocketperm {{ redis_socket_perm }} +{% endif -%} + +# Close the connection after a client is idle for N seconds (0 to disable) +timeout {{ redis_timeout }} + +# TCP keepalive. +# +# If non-zero, use SO_KEEPALIVE to send TCP ACKs to clients in absence +# of communication. This is useful for two reasons: +# +# 1) Detect dead peers. +# 2) Take the connection alive from the point of view of network +# equipment in the middle. +# +# On Linux, the specified value (in seconds) is the period used to send ACKs. +# Note that to close the connection the double of the time is needed. 
+# On other kernels the period depends on the kernel configuration. +# +# A reasonable value for this option is 60 seconds. +tcp-keepalive {{ redis_tcp_keepalive }} + +# Specify the server verbosity level. +# This can be one of: +# debug (a lot of information, useful for development/testing) +# verbose (many rarely useful info, but not a mess like the debug level) +# notice (moderately verbose, what you want in production probably) +# warning (only very important / critical messages are logged) +loglevel {{ redis_loglevel }} + +# Specify the log file name. Also the empty string can be used to force +# Redis to log on the standard output. Note that if you use standard +# output for logging but daemonize, logs will be sent to /dev/null +logfile {{ redis_logfile }} + +# To enable logging to the system logger, just set 'syslog-enabled' to yes, +# and optionally update the other syslog parameters to suit your needs. +syslog-enabled {{ redis_syslog_enabled }} + +# Specify the syslog identity. +syslog-ident {{ redis_syslog_ident }} + +# Specify the syslog facility. Must be USER or between LOCAL0-LOCAL7. +syslog-facility {{ redis_syslog_facility }} + +# Set the number of databases. The default database is DB 0, you can select +# a different one on a per-connection basis using SELECT where +# dbid is a number between 0 and 'databases'-1 +databases {{ redis_databases }} + +################################ SNAPSHOTTING ################################ +# +# Save the DB on disk: +# +# save +# +# Will save the DB if both the given number of seconds and the given +# number of write operations against the DB occurred. +# +# In the example below the behaviour will be to save: +# after 900 sec (15 min) if at least 1 key changed +# after 300 sec (5 min) if at least 10 keys changed +# after 60 sec if at least 10000 keys changed +# +# Note: you can disable saving at all commenting all the "save" lines. 
+# +# It is also possible to remove all the previously configured save +# points by adding a save directive with a single empty string argument +# like in the following example: +# +# save "" + +{% for save in redis_save -%} +save {{ save }} +{% endfor -%} + +# By default Redis will stop accepting writes if RDB snapshots are enabled +# (at least one save point) and the latest background save failed. +# This will make the user aware (in a hard way) that data is not persisting +# on disk properly, otherwise chances are that no one will notice and some +# disaster will happen. +# +# If the background saving process will start working again Redis will +# automatically allow writes again. +# +# However if you have setup your proper monitoring of the Redis server +# and persistence, you may want to disable this feature so that Redis will +# continue to work as usual even if there are problems with disk, +# permissions, and so forth. +stop-writes-on-bgsave-error yes + +# Compress string objects using LZF when dump .rdb databases? +# For default that's set to 'yes' as it's almost always a win. +# If you want to save some CPU in the saving child set it to 'no' but +# the dataset will likely be bigger if you have compressible values or keys. +rdbcompression yes + +# Since version 5 of RDB a CRC64 checksum is placed at the end of the file. +# This makes the format more resistant to corruption but there is a performance +# hit to pay (around 10%) when saving and loading RDB files, so you can disable it +# for maximum performances. +# +# RDB files created with checksum disabled have a checksum of zero that will +# tell the loading code to skip the check. +rdbchecksum yes + +# The filename where to dump the DB +dbfilename dump.rdb + +# The working directory. +# +# The DB will be written inside this directory, with the filename specified +# above using the 'dbfilename' configuration directive. +# +# The Append Only File will also be created inside this directory. 
+# +# Note that you must specify a directory here, not a file name. +dir {{ redis_dir }} + +################################# REPLICATION ################################# + +# Master-Slave replication. Use slaveof to make a Redis instance a copy of +# another Redis server. Note that the configuration is local to the slave +# so for example it is possible to configure the slave to save the DB with a +# different interval, or to listen to another port, and so on. +# +# slaveof +{% if redis_slaveof -%} +slaveof {{ redis_slaveof }} +{% endif -%} +# If the master is password protected (using the "requirepass" configuration +# directive below) it is possible to tell the slave to authenticate before +# starting the replication synchronization process, otherwise the master will +# refuse the slave request. +# +# masterauth +{% if redis_slaveof and redis_password -%} +masterauth {{ redis_password }} +{% endif -%} + +# When a slave loses its connection with the master, or when the replication +# is still in progress, the slave can act in two different ways: +# +# 1) if slave-serve-stale-data is set to 'yes' (the default) the slave will +# still reply to client requests, possibly with out of date data, or the +# data set may just be empty if this is the first synchronization. +# +# 2) if slave-serve-stale-data is set to 'no' the slave will reply with +# an error "SYNC with master in progress" to all the kind of commands +# but to INFO and SLAVEOF. +# +slave-serve-stale-data yes + +# You can configure a slave instance to accept writes or not. Writing against +# a slave instance may be useful to store some ephemeral data (because data +# written on a slave will be easily deleted after resync with the master) but +# may also cause problems if clients are writing to it because of a +# misconfiguration. +# +# Since Redis 2.6 by default slaves are read-only. +# +# Note: read only slaves are not designed to be exposed to untrusted clients +# on the internet. 
It's just a protection layer against misuse of the instance. +# Still a read only slave exports by default all the administrative commands +# such as CONFIG, DEBUG, and so forth. To a limited extent you can improve +# security of read only slaves using 'rename-command' to shadow all the +# administrative / dangerous commands. +slave-read-only {{ redis_slave_read_only }} + +# Slaves send PINGs to server in a predefined interval. It's possible to change +# this interval with the repl_ping_slave_period option. The default value is 10 +# seconds. +# +# repl-ping-slave-period 10 + +# The following option sets the replication timeout for: +# +# 1) Bulk transfer I/O during SYNC, from the point of view of slave. +# 2) Master timeout from the point of view of slaves (data, pings). +# 3) Slave timeout from the point of view of masters (REPLCONF ACK pings). +# +# It is important to make sure that this value is greater than the value +# specified for repl-ping-slave-period otherwise a timeout will be detected +# every time there is low traffic between the master and the slave. +# +# repl-timeout 60 + +# Disable TCP_NODELAY on the slave socket after SYNC? +# +# If you select "yes" Redis will use a smaller number of TCP packets and +# less bandwidth to send data to slaves. But this can add a delay for +# the data to appear on the slave side, up to 40 milliseconds with +# Linux kernels using a default configuration. +# +# If you select "no" the delay for data to appear on the slave side will +# be reduced but more bandwidth will be used for replication. +# +# By default we optimize for low latency, but in very high traffic conditions +# or when the master and slaves are many hops away, turning this to "yes" may +# be a good idea. +repl-disable-tcp-nodelay no + +# Set the replication backlog size. 
The backlog is a buffer that accumulates +# slave data when slaves are disconnected for some time, so that when a slave +# wants to reconnect again, often a full resync is not needed, but a partial +# resync is enough, just passing the portion of data the slave missed while +# disconnected. +# +# The bigger the replication backlog, the longer the time the slave can be +# disconnected and later be able to perform a partial resynchronization. +# +# The backlog is only allocated once there is at least a slave connected. +# +# repl-backlog-size 1mb +{% if redis_repl_backlog_size -%} +repl-backlog-size {{ redis_repl_backlog_size }} +{% endif -%} + +# After a master has no longer connected slaves for some time, the backlog +# will be freed. The following option configures the amount of seconds that +# need to elapse, starting from the time the last slave disconnected, for +# the backlog buffer to be freed. +# +# A value of 0 means to never release the backlog. +# +# repl-backlog-ttl 3600 + +# The slave priority is an integer number published by Redis in the INFO output. +# It is used by Redis Sentinel in order to select a slave to promote into a +# master if the master is no longer working correctly. +# +# A slave with a low priority number is considered better for promotion, so +# for instance if there are three slaves with priority 10, 100, 25 Sentinel will +# pick the one with priority 10, that is the lowest. +# +# However a special priority of 0 marks the slave as not able to perform the +# role of master, so a slave with priority of 0 will never be selected by +# Redis Sentinel for promotion. +# +# By default the priority is 100. +slave-priority {{ redis_slave_priority }} + +# It is possible for a master to stop accepting writes if there are fewer than +# N slaves connected, having a lag less than or equal to M seconds. +# +# The N slaves need to be in "online" state.
+# +# The lag in seconds, that must be <= the specified value, is calculated from +# the last ping received from the slave, that is usually sent every second. +# +# This option does not GUARANTEE that N replicas will accept the write, but +# will limit the window of exposure for lost writes in case not enough slaves +# are available, to the specified number of seconds. +# +# For example to require at least 3 slaves with a lag <= 10 seconds use: +# +# min-slaves-to-write 3 +# min-slaves-max-lag 10 +# +# Setting one or the other to 0 disables the feature. +# +# By default min-slaves-to-write is set to 0 (feature disabled) and +# min-slaves-max-lag is set to 10. + +################################## SECURITY ################################### + +# Require clients to issue AUTH before processing any other +# commands. This might be useful in environments in which you do not trust +# others with access to the host running redis-server. +# +# This should stay commented out for backward compatibility and because most +# people do not need auth (e.g. they run their own servers). +# +# Warning: since Redis is pretty fast an outside user can try up to +# 150k passwords per second against a good box. This means that you should +# use a very strong password otherwise it will be very easy to break. +# +# requirepass foobared +{% if redis_password -%} +requirepass {{ redis_password }} +{% endif -%} + +# Command renaming. +# +# It is possible to change the name of dangerous commands in a shared +# environment. For instance the CONFIG command may be renamed into something +# hard to guess so that it will still be available for internal-use tools +# but not available for general clients.
+# +# Example: +# +# rename-command CONFIG b840fc02d524045429941cc15f59e41cb7be6c52 +# +# It is also possible to completely kill a command by renaming it into +# an empty string: +# +# rename-command CONFIG "" +# +# Please note that changing the name of commands that are logged into the +# AOF file or transmitted to slaves may cause problems. +{% for command in redis_rename_commands -%} +rename-command {{ command }} +{% endfor -%} + +################################### LIMITS #################################### + +# Set the max number of connected clients at the same time. By default +# this limit is set to 10000 clients, however if the Redis server is not +# able to configure the process file limit to allow for the specified limit +# the max number of allowed clients is set to the current file limit +# minus 32 (as Redis reserves a few file descriptors for internal uses). +# +# Once the limit is reached Redis will close all the new connections sending +# an error 'max number of clients reached'. +# +maxclients {{ redis_maxclients }} + +# Don't use more memory than the specified amount of bytes. +# When the memory limit is reached Redis will try to remove keys +# according to the eviction policy selected (see maxmemory-policy). +# +# If Redis can't remove keys according to the policy, or if the policy is +# set to 'noeviction', Redis will start to reply with errors to commands +# that would use more memory, like SET, LPUSH, and so on, and will continue +# to reply to read-only commands like GET. +# +# This option is usually useful when using Redis as an LRU cache, or to set +# a hard memory limit for an instance (using the 'noeviction' policy). 
+# +# WARNING: If you have slaves attached to an instance with maxmemory on, +# the size of the output buffers needed to feed the slaves are subtracted +# from the used memory count, so that network problems / resyncs will +# not trigger a loop where keys are evicted, and in turn the output +# buffer of slaves is full with DELs of keys evicted triggering the deletion +# of more keys, and so forth until the database is completely emptied. +# +# In short... if you have slaves attached it is suggested that you set a lower +# limit for maxmemory so that there is some free RAM on the system for slave +# output buffers (but this is not needed if the policy is 'noeviction'). +# +# maxmemory +{% if redis_maxmemory -%} +maxmemory {{ redis_maxmemory }} +{% endif -%} + +# MAXMEMORY POLICY: how Redis will select what to remove when maxmemory +# is reached. You can select among five behaviors: +# +# volatile-lru -> remove the key with an expire set using an LRU algorithm +# allkeys-lru -> remove any key accordingly to the LRU algorithm +# volatile-random -> remove a random key with an expire set +# allkeys-random -> remove a random key, any key +# volatile-ttl -> remove the key with the nearest expire time (minor TTL) +# noeviction -> don't expire at all, just return an error on write operations +# +# Note: with any of the above policies, Redis will return an error on write +# operations, when there are not suitable keys for eviction. 
+# +# At the date of writing these commands are: set setnx setex append +# incr decr rpush lpush rpushx lpushx linsert lset rpoplpush sadd +# sinter sinterstore sunion sunionstore sdiff sdiffstore zadd zincrby +# zunionstore zinterstore hset hsetnx hmset hincrby incrby decrby +# getset mset msetnx exec sort +# +# The default is: +# +# maxmemory-policy noeviction +maxmemory-policy {{ redis_maxmemory_policy }} + +# LRU and minimal TTL algorithms are not precise algorithms but approximated +# algorithms (in order to save memory), so you can tune it for speed or +# accuracy. By default Redis will check five keys and pick the one that was +# used less recently, you can change the sample size using the following +# configuration directive. +# +# The default of 5 produces good enough results. 10 Approximates very closely +# true LRU but costs a bit more CPU. 3 is very fast but not very accurate. +# +# maxmemory-samples 5 + +############################## APPEND ONLY MODE ############################### + +# By default Redis asynchronously dumps the dataset on disk. This mode is +# good enough in many applications, but an issue with the Redis process or +# a power outage may result in a few minutes of writes lost (depending on +# the configured save points). +# +# The Append Only File is an alternative persistence mode that provides +# much better durability. For instance using the default data fsync policy +# (see later in the config file) Redis can lose just one second of writes in a +# dramatic event like a server power outage, or a single write if something +# wrong with the Redis process itself happens, but the operating system is +# still running correctly. +# +# AOF and RDB persistence can be enabled at the same time without problems. +# If the AOF is enabled on startup Redis will load the AOF, that is the file +# with the better durability guarantees. +# +# Please check http://redis.io/topics/persistence for more information.
+ +appendonly {{ redis_appendonly }} + +# The name of the append only file (default: "appendonly.aof") + +appendfilename "{{ redis_appendfilename }}" + +# The fsync() call tells the Operating System to actually write data on disk +# instead to wait for more data in the output buffer. Some OS will really flush +# data on disk, some other OS will just try to do it ASAP. +# +# Redis supports three different modes: +# +# no: don't fsync, just let the OS flush the data when it wants. Faster. +# always: fsync after every write to the append only log . Slow, Safest. +# everysec: fsync only one time every second. Compromise. +# +# The default is "everysec", as that's usually the right compromise between +# speed and data safety. It's up to you to understand if you can relax this to +# "no" that will let the operating system flush the output buffer when +# it wants, for better performances (but if you can live with the idea of +# some data loss consider the default persistence mode that's snapshotting), +# or on the contrary, use "always" that's very slow but a bit safer than +# everysec. +# +# More details please check the following article: +# http://antirez.com/post/redis-persistence-demystified.html +# +# If unsure, use "everysec". + +# appendfsync always +appendfsync {{ redis_appendfsync }} +# appendfsync no + +# When the AOF fsync policy is set to always or everysec, and a background +# saving process (a background save or AOF log background rewriting) is +# performing a lot of I/O against the disk, in some Linux configurations +# Redis may block too long on the fsync() call. Note that there is no fix for +# this currently, as even performing fsync in a different thread will block +# our synchronous write(2) call. +# +# In order to mitigate this problem it's possible to use the following option +# that will prevent fsync() from being called in the main process while a +# BGSAVE or BGREWRITEAOF is in progress. 
+# +# This means that while another child is saving, the durability of Redis is +# the same as "appendfsync none". In practical terms, this means that it is +# possible to lose up to 30 seconds of log in the worst scenario (with the +# default Linux settings). +# +# If you have latency problems turn this to "yes". Otherwise leave it as +# "no" that is the safest pick from the point of view of durability. + +no-appendfsync-on-rewrite {{ redis_no_appendfsync_on_rewrite }} + +# Automatic rewrite of the append only file. +# Redis is able to automatically rewrite the log file implicitly calling +# BGREWRITEAOF when the AOF log size grows by the specified percentage. +# +# This is how it works: Redis remembers the size of the AOF file after the +# latest rewrite (if no rewrite has happened since the restart, the size of +# the AOF at startup is used). +# +# This base size is compared to the current size. If the current size is +# bigger than the specified percentage, the rewrite is triggered. Also +# you need to specify a minimal size for the AOF file to be rewritten, this +# is useful to avoid rewriting the AOF file even if the percentage increase +# is reached but it is still pretty small. +# +# Specify a percentage of zero in order to disable the automatic AOF +# rewrite feature. + +auto-aof-rewrite-percentage {{ redis_auto_aof_rewrite_percentage }} +auto-aof-rewrite-min-size {{ redis_auto_aof_rewrite_min_size }} + +################################ LUA SCRIPTING ############################### + +# Max execution time of a Lua script in milliseconds. +# +# If the maximum execution time is reached Redis will log that a script is +# still in execution after the maximum allowed time and will start to +# reply to queries with an error. +# +# When a long running script exceed the maximum execution time only the +# SCRIPT KILL and SHUTDOWN NOSAVE commands are available. The first can be +# used to stop a script that did not yet called write commands. 
The second +# is the only way to shut down the server in the case a write commands was +# already issue by the script but the user don't want to wait for the natural +# termination of the script. +# +# Set it to 0 or a negative value for unlimited execution without warnings. +lua-time-limit 5000 + +################################ REDIS CLUSTER ############################### +# +# Normal Redis instances can't be part of a Redis Cluster; only nodes that are +# started as cluster nodes can. In order to start a Redis instance as a +# cluster node enable the cluster support uncommenting the following: +# +# cluster-enabled yes + +# Every cluster node has a cluster configuration file. This file is not +# intended to be edited by hand. It is created and updated by Redis nodes. +# Every Redis Cluster node requires a different cluster configuration file. +# Make sure that instances running in the same system does not have +# overlapping cluster configuration file names. +# +# cluster-config-file nodes-6379.conf + +# Cluster node timeout is the amount of milliseconds a node must be unreachable +# for it to be considered in failure state. +# Most other internal time limits are multiple of the node timeout. +# +# cluster-node-timeout 15000 + +# Cluster slaves are able to migrate to orphaned masters, that are masters +# that are left without working slaves. This improves the cluster ability +# to resist to failures as otherwise an orphaned master can't be failed over +# in case of failure if it has no working slaves. +# +# Slaves migrate to orphaned masters only if there are still at least a +# given number of other working slaves for their old master. This number +# is the "migration barrier". A migration barrier of 1 means that a slave +# will migrate only if there is at least 1 other working slave for its master +# and so forth. It usually reflects the number of slaves you want for every +# master in your cluster. 
+# +# Default is 1 (slaves migrate only if their masters remain with at least +# one slave). To disable migration just set it to a very large value. +# A value of 0 can be set but is useful only for debugging and dangerous +# in production. +# +# cluster-migration-barrier 1 + +# In order to setup your cluster make sure to read the documentation +# available at http://redis.io web site. + +################################## SLOW LOG ################################### + +# The Redis Slow Log is a system to log queries that exceeded a specified +# execution time. The execution time does not include the I/O operations +# like talking with the client, sending the reply and so forth, +# but just the time needed to actually execute the command (this is the only +# stage of command execution where the thread is blocked and can not serve +# other requests in the meantime). +# +# You can configure the slow log with two parameters: one tells Redis +# what is the execution time, in microseconds, to exceed in order for the +# command to get logged, and the other parameter is the length of the +# slow log. When a new command is logged the oldest one is removed from the +# queue of logged commands. + +# The following time is expressed in microseconds, so 1000000 is equivalent +# to one second. Note that a negative number disables the slow log, while +# a value of zero forces the logging of every command. +slowlog-log-slower-than {{ redis_slowlog_log_slower_than }} + +# There is no limit to this length. Just be aware that it will consume memory. +# You can reclaim memory used by the slow log with SLOWLOG RESET. +slowlog-max-len {{ redis_slowlog_max_len }} + +############################# Event notification ############################## + +# Redis can notify Pub/Sub clients about events happening in the key space. 
+# This feature is documented at http://redis.io/topics/keyspace-events +# +# For instance if keyspace events notification is enabled, and a client +# performs a DEL operation on key "foo" stored in the Database 0, two +# messages will be published via Pub/Sub: +# +# PUBLISH __keyspace@0__:foo del +# PUBLISH __keyevent@0__:del foo +# +# It is possible to select the events that Redis will notify among a set +# of classes. Every class is identified by a single character: +# +# K Keyspace events, published with __keyspace@__ prefix. +# E Keyevent events, published with __keyevent@__ prefix. +# g Generic commands (non-type specific) like DEL, EXPIRE, RENAME, ... +# $ String commands +# l List commands +# s Set commands +# h Hash commands +# z Sorted set commands +# x Expired events (events generated every time a key expires) +# e Evicted events (events generated when a key is evicted for maxmemory) +# A Alias for g$lshzxe, so that the "AKE" string means all the events. +# +# The "notify-keyspace-events" takes as argument a string that is composed +# by zero or multiple characters. The empty string means that notifications +# are disabled at all. +# +# Example: to enable list and generic events, from the point of view of the +# event name, use: +# +# notify-keyspace-events Elg +# +# Example 2: to get the stream of the expired keys subscribing to channel +# name __keyevent@0__:expired use: +# +# notify-keyspace-events Ex +# +# By default all notifications are disabled because most users don't need +# this feature and the feature has some overhead. Note that if you don't +# specify at least one of K or E, no events will be delivered. +notify-keyspace-events "" + +############################### ADVANCED CONFIG ############################### + +# Hashes are encoded using a memory efficient data structure when they have a +# small number of entries, and the biggest entry does not exceed a given +# threshold. 
These thresholds can be configured using the following directives. +hash-max-ziplist-entries 512 +hash-max-ziplist-value 64 + +# Similarly to hashes, small lists are also encoded in a special way in order +# to save a lot of space. The special representation is only used when +# you are under the following limits: +list-max-ziplist-entries 512 +list-max-ziplist-value 64 + +# Sets have a special encoding in just one case: when a set is composed +# of just strings that happen to be integers in radix 10 in the range +# of 64 bit signed integers. +# The following configuration setting sets the limit in the size of the +# set in order to use this special memory saving encoding. +set-max-intset-entries 512 + +# Similarly to hashes and lists, sorted sets are also specially encoded in +# order to save a lot of space. This encoding is only used when the length and +# elements of a sorted set are below the following limits: +zset-max-ziplist-entries 128 +zset-max-ziplist-value 64 + +# Active rehashing uses 1 millisecond every 100 milliseconds of CPU time in +# order to help rehashing the main Redis hash table (the one mapping top-level +# keys to values). The hash table implementation Redis uses (see dict.c) +# performs a lazy rehashing: the more operations you run into a hash table +# that is rehashing, the more rehashing "steps" are performed, so if the +# server is idle the rehashing is never complete and some more memory is used +# by the hash table. +# +# The default is to use this millisecond 10 times every second in order to +# actively rehash the main dictionaries, freeing memory when possible. +# +# If unsure: +# use "activerehashing no" if you have hard latency requirements and it is +# not a good thing in your environment that Redis can reply from time to time +# to queries with 2 milliseconds delay. +# +# use "activerehashing yes" if you don't have such hard requirements but +# want to free memory asap when possible.
+activerehashing yes + +# The client output buffer limits can be used to force disconnection of clients +# that are not reading data from the server fast enough for some reason (a +# common reason is that a Pub/Sub client can't consume messages as fast as the +# publisher can produce them). +# +# The limit can be set differently for the three different classes of clients: +# +# normal -> normal clients +# slave -> slave clients and MONITOR clients +# pubsub -> clients subscribed to at least one pubsub channel or pattern +# +# The syntax of every client-output-buffer-limit directive is the following: +# +# client-output-buffer-limit +# +# A client is immediately disconnected once the hard limit is reached, or if +# the soft limit is reached and remains reached for the specified number of +# seconds (continuously). +# So for instance if the hard limit is 32 megabytes and the soft limit is +# 16 megabytes / 10 seconds, the client will get disconnected immediately +# if the size of the output buffers reach 32 megabytes, but will also get +# disconnected if the client reaches 16 megabytes and continuously overcomes +# the limit for 10 seconds. +# +# By default normal clients are not limited because they don't receive data +# without asking (in a push way), but just after a request, so only +# asynchronous clients may create a scenario where data is requested faster +# than it can read. +# +# Instead there is a default limit for pubsub and slave clients, since +# subscribers and slaves receive data in a push fashion. +# +# Both the hard or the soft limit can be disabled by setting them to zero. +client-output-buffer-limit normal 0 0 0 +client-output-buffer-limit slave 256mb 64mb 60 +client-output-buffer-limit pubsub 32mb 8mb 60 + +# Redis calls an internal function to perform many background tasks, like +# closing connections of clients in timeout, purging expired keys that are +# never requested, and so forth. 
+# +# Not all tasks are performed with the same frequency, but Redis checks for +# tasks to perform accordingly to the specified "hz" value. +# +# By default "hz" is set to 10. Raising the value will use more CPU when +# Redis is idle, but at the same time will make Redis more responsive when +# there are many keys expiring at the same time, and timeouts may be +# handled with more precision. +# +# The range is between 1 and 500, however a value over 100 is usually not +# a good idea. Most users should use the default of 10 and raise this up to +# 100 only in environments where very low latency is required. +hz 10 + +# When a child rewrites the AOF file, if the following option is enabled +# the file will be fsync-ed every 32 MB of data generated. This is useful +# in order to commit the file to the disk more incrementally and avoid +# big latency spikes. +aof-rewrite-incremental-fsync yes + diff --git a/ansible/roles/supervisord/defaults/main.yml b/ansible/roles/supervisord/defaults/main.yml new file mode 100644 index 0000000..fee2098 --- /dev/null +++ b/ansible/roles/supervisord/defaults/main.yml @@ -0,0 +1,5 @@ +--- + +supervisord_log_dir: /var/log/supervisor +supervisord_programs_dir: /etc/supervisor/conf.d +supervisord_sock: /var/run/supervisor.sock \ No newline at end of file diff --git a/ansible/roles/supervisord/files/supervisord_centos b/ansible/roles/supervisord/files/supervisord_centos new file mode 100755 index 0000000..c13dd4a --- /dev/null +++ b/ansible/roles/supervisord/files/supervisord_centos @@ -0,0 +1,64 @@ +#!/bin/sh +# +# /etc/rc.d/init.d/supervisord +# +# Supervisor is a client/server system that +# allows its users to monitor and control a +# number of processes on UNIX-like operating +# systems. +# +# chkconfig: - 64 36 +# description: Supervisor Server +# processname: supervisord + +# Source init functions +. 
/etc/rc.d/init.d/functions + +prog="supervisord" + +prefix="/usr/" +exec_prefix="${prefix}" +prog_bin="${exec_prefix}/bin/supervisord" +PIDFILE="/var/run/$prog.pid" + +start() +{ + echo -n $"Starting $prog: " + daemon $prog_bin --pidfile $PIDFILE + retval=$? + echo + [ $retval -eq 0 ] && [ -f $PIDFILE ] + return $retval +} + +stop() +{ + echo -n $"Shutting down $prog: " + [ -f $PIDFILE ] && killproc $prog || success $"$prog shutdown" + echo +} + +case "$1" in + + start) + start + ;; + + stop) + stop + ;; + + status) + status $prog + ;; + + restart) + stop + start + ;; + + *) + echo "Usage: $0 {start|stop|restart|status}" + ;; + +esac \ No newline at end of file diff --git a/ansible/roles/supervisord/files/supervisord_ubuntu b/ansible/roles/supervisord/files/supervisord_ubuntu new file mode 100644 index 0000000..cf551dd --- /dev/null +++ b/ansible/roles/supervisord/files/supervisord_ubuntu @@ -0,0 +1,154 @@ +#! /bin/sh +### BEGIN INIT INFO +# Provides: supervisord +# Required-Start: $remote_fs +# Required-Stop: $remote_fs +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Example initscript +# Description: This file should be used to construct scripts to be +# placed in /etc/init.d. +### END INIT INFO + +# Author: Dan MacKinlay +# Based on instructions by Bertrand Mathieu +# http://zebert.blogspot.com/2009/05/installing-django-solr-varnish-and.html + +# Do NOT "set -e" + +# PATH should only include /usr/* if it runs after the mountnfs.sh script +PATH=/sbin:/usr/sbin:/bin:/usr/bin +DESC="Description of the service" +NAME=supervisord +DAEMON=/usr/local/bin/supervisord +DAEMON_ARGS="-c /etc/supervisord.conf" +PIDFILE=/var/run/$NAME.pid +SCRIPTNAME=/etc/init.d/$NAME + +# Exit if the package is not installed +[ -x "$DAEMON" ] || exit 0 + +# Read configuration variable file if it is present +[ -r /etc/default/$NAME ] && . /etc/default/$NAME + +# Load the VERBOSE setting and other rcS variables +. /lib/init/vars.sh + +# Define LSB log_* functions. 
+# Depend on lsb-base (>= 3.0-6) to ensure that this file is present. +. /lib/lsb/init-functions + +# +# Function that starts the daemon/service +# +do_start() +{ + # Return + # 0 if daemon has been started + # 1 if daemon was already running + # 2 if daemon could not be started + start-stop-daemon --start --quiet --pidfile $PIDFILE --exec $DAEMON --test > /dev/null \ + || return 1 + start-stop-daemon --start --quiet --pidfile $PIDFILE --exec $DAEMON -- \ + $DAEMON_ARGS \ + || return 2 + # Add code here, if necessary, that waits for the process to be ready + # to handle requests from services started subsequently which depend + # on this one. As a last resort, sleep for some time. +} + +# +# Function that stops the daemon/service +# +do_stop() +{ + # Return + # 0 if daemon has been stopped + # 1 if daemon was already stopped + # 2 if daemon could not be stopped + # other if a failure occurred + start-stop-daemon --stop --quiet --retry=TERM/30/KILL/5 --pidfile $PIDFILE --name $NAME + RETVAL="$?" + [ "$RETVAL" = 2 ] && return 2 + # Wait for children to finish too if this is a daemon that forks + # and if the daemon is only ever run from this initscript. + # If the above conditions are not satisfied then add some other code + # that waits for the process to drop all resources that could be + # needed by services started subsequently. A last resort is to + # sleep for some time. + start-stop-daemon --stop --quiet --oknodo --retry=0/30/KILL/5 --exec $DAEMON + [ "$?" = 2 ] && return 2 + # Many daemons don't delete their pidfiles when they exit. + rm -f $PIDFILE + return "$RETVAL" +} + +# +# Function that sends a SIGHUP to the daemon/service +# +do_reload() { + # + # If the daemon can reload its configuration without + # restarting (for example, when it is sent a SIGHUP), + # then implement that here. 
+ # + start-stop-daemon --stop --signal 1 --quiet --pidfile $PIDFILE --name $NAME + return 0 +} + +case "$1" in + start) + [ "$VERBOSE" != no ] && log_daemon_msg "Starting $DESC" "$NAME" + do_start + case "$?" in + 0|1) [ "$VERBOSE" != no ] && log_end_msg 0 ;; + 2) [ "$VERBOSE" != no ] && log_end_msg 1 ;; + esac + ;; + stop) + [ "$VERBOSE" != no ] && log_daemon_msg "Stopping $DESC" "$NAME" + do_stop + case "$?" in + 0|1) [ "$VERBOSE" != no ] && log_end_msg 0 ;; + 2) [ "$VERBOSE" != no ] && log_end_msg 1 ;; + esac + ;; + #reload|force-reload) + # + # If do_reload() is not implemented then leave this commented out + # and leave 'force-reload' as an alias for 'restart'. + # + #log_daemon_msg "Reloading $DESC" "$NAME" + #do_reload + #log_end_msg $? + #;; + restart|force-reload) + # + # If the "reload" option is implemented then remove the + # 'force-reload' alias + # + log_daemon_msg "Restarting $DESC" "$NAME" + do_stop + case "$?" in + 0|1) + do_start + case "$?" in + 0) log_end_msg 0 ;; + 1) log_end_msg 1 ;; # Old process is still running + *) log_end_msg 1 ;; # Failed to start + esac + ;; + *) + # Failed to stop + log_end_msg 1 + ;; + esac + ;; + *) + #echo "Usage: $SCRIPTNAME {start|stop|restart|reload|force-reload}" >&2 + echo "Usage: $SCRIPTNAME {start|stop|restart|force-reload}" >&2 + exit 3 + ;; +esac + +: \ No newline at end of file diff --git a/ansible/roles/supervisord/handlers/main.yml b/ansible/roles/supervisord/handlers/main.yml new file mode 100644 index 0000000..3b9d144 --- /dev/null +++ b/ansible/roles/supervisord/handlers/main.yml @@ -0,0 +1,3 @@ +--- + - name: reread supervisord + shell: supervisorctl reread && supervisorctl update diff --git a/ansible/roles/supervisord/meta/main.yml b/ansible/roles/supervisord/meta/main.yml new file mode 100644 index 0000000..0d4ba0a --- /dev/null +++ b/ansible/roles/supervisord/meta/main.yml @@ -0,0 +1,3 @@ +--- +dependencies: + - { role: pip } diff --git a/ansible/roles/supervisord/tasks/main.yml 
b/ansible/roles/supervisord/tasks/main.yml new file mode 100644 index 0000000..0e15618 --- /dev/null +++ b/ansible/roles/supervisord/tasks/main.yml @@ -0,0 +1,56 @@ +--- +- name: ensure supervisord is installed + pip: + name=supervisor + state=present + tags: supervisord + +- name: ensure directories are created + file: + state=directory + dest={{ item }} + mode=0755 + with_items: + - "{{ supervisord_log_dir }}" + - "{{ supervisord_programs_dir }}" + tags: supervisord + +- name: setup configuration + template: + src=supervisord.conf.j2 + dest=/etc/supervisord.conf + tags: supervisord + +- name: setup CentOS supervisord service script + copy: + src=supervisord_centos + dest=/etc/init.d/supervisord + owner=root + group=root + mode=0755 + tags: supervisord + when: ansible_os_family == "RedHat" + +- name: setup Ubuntu supervisord service script + copy: + src=supervisord_ubuntu + dest=/etc/init.d/supervisord + owner=root + group=root + mode=0755 + tags: supervisord + when: ansible_os_family == "Debian" + +- name: ensure supervisord is running + service: + name: supervisord + state: started + tags: supervisord + +- name: reload supervisor + shell: "supervisorctl reload" + tags: supervisord + +- name: set up sc alias + lineinfile: dest="/root/.bashrc" line="alias sc='supervisorctl'" + diff --git a/ansible/roles/supervisord/templates/supervisord.conf.j2 b/ansible/roles/supervisord/templates/supervisord.conf.j2 new file mode 100644 index 0000000..3c9c35f --- /dev/null +++ b/ansible/roles/supervisord/templates/supervisord.conf.j2 @@ -0,0 +1,26 @@ +[unix_http_server] +file={{ supervisord_sock }} ; (the path to the socket file) +chmod=0700 ; socket file mode (default 0700) + +[supervisord] +logfile={{ supervisord_log_dir }}/supervisord.log ; (main log file;default $CWD/supervisord.log) +logfile_maxbytes=50MB ; maximum size of logfile before rotation +logfile_backups=10 ; number of backed up logfiles +loglevel=info ; critical, error, warn, info, debug, trace, blather
+pidfile=/var/run/supervisord.pid ; (supervisord pidfile;default supervisord.pid) +childlogdir={{ supervisord_log_dir }} ; ('AUTO' child log dir, default $TEMP) + +; the below section must remain in the config file for RPC +; (supervisorctl/web interface) to work, additional interfaces may be +; added by defining them in separate rpcinterface: sections +[rpcinterface:supervisor] +supervisor.rpcinterface_factory=supervisor.rpcinterface:make_main_rpcinterface + +[supervisorctl] +serverurl=unix://{{ supervisord_sock }} ; use a unix:// URL for a unix socket + +[include] +files={{ supervisord_programs_dir }}/*.conf + +[inet_http_server] +port = *:9001 \ No newline at end of file diff --git a/ansible/roles/zookeeper/defaults/main.yml b/ansible/roles/zookeeper/defaults/main.yml new file mode 100644 index 0000000..3b29a24 --- /dev/null +++ b/ansible/roles/zookeeper/defaults/main.yml @@ -0,0 +1,22 @@ +--- + +zookeeper_version: 3.4.6 + +zookeeper_install_dir: /opt/zookeeper +zookeeper_base_dir: "{{ zookeeper_install_dir }}/default" +zookeeper_conf_dir: "{{ zookeeper_base_dir }}/conf" +zookeeper_data_dir: "{{ zookeeper_base_dir }}/data" +zookeeper_log_dir: "{{ zookeeper_base_dir }}/logs" + + +zookeeper_client_port: 2181 +zookeeper_conn_port: 2888 +zookeeper_lead_port: 3888 +zookeeper_tick_time: 2000 +zookeeper_init_limit: 10 +zookeeper_sync_limit: 5 +zookeeper_maxClientCnxns: 60 +zookeeper_auto_purge_interval: 24 +zookeeper_auto_purge_snap_retain_count: 5 + +zookeeper_source: "http://apache.mirrors.hoobly.com/zookeeper" \ No newline at end of file diff --git a/ansible/roles/zookeeper/handlers/main.yml b/ansible/roles/zookeeper/handlers/main.yml new file mode 100644 index 0000000..3e19693 --- /dev/null +++ b/ansible/roles/zookeeper/handlers/main.yml @@ -0,0 +1,10 @@ +--- +- name: restart zookeeper + supervisorctl: + name=zookeeper + state=restarted + +- name: wait for zookeeper port + wait_for: + port={{ zookeeper_client_port }} + state=started diff --git 
a/ansible/roles/zookeeper/meta/main.yml b/ansible/roles/zookeeper/meta/main.yml new file mode 100644 index 0000000..f5b7113 --- /dev/null +++ b/ansible/roles/zookeeper/meta/main.yml @@ -0,0 +1,4 @@ +--- +dependencies: + - { role: supervisord } + - { role: java } diff --git a/ansible/roles/zookeeper/tasks/main.yml b/ansible/roles/zookeeper/tasks/main.yml new file mode 100644 index 0000000..8bcd450 --- /dev/null +++ b/ansible/roles/zookeeper/tasks/main.yml @@ -0,0 +1,98 @@ +--- +- name: create zookeeper install directory + file: + path={{ item }} + state=directory + mode=0744 + with_items: + - "{{ zookeeper_install_dir }}" + tags: zookeeper + +- name: check for existing install + stat: path={{ zookeeper_install_dir }}/zookeeper-{{ zookeeper_version }} + register: zookeeper + tags: zookeeper + +- name: download zookeeper + get_url: + url="{{ zookeeper_source }}/zookeeper-{{ zookeeper_version }}/zookeeper-{{ zookeeper_version }}.tar.gz" + dest=/tmp/zookeeper-{{ zookeeper_version }}.tgz + mode=0644 + validate_certs=no + when: zookeeper.stat.isdir is not defined + tags: zookeeper + +- name: extract zookeeper + unarchive: + src=/tmp/zookeeper-{{ zookeeper_version }}.tgz + dest={{ zookeeper_install_dir }} + copy=no + when: zookeeper.stat.isdir is not defined + tags: zookeeper + +- name: delete temporary zookeeper file + file: + path=/tmp/zookeeper-{{ zookeeper_version }}.tgz + state=absent + ignore_errors: yes + tags: zookeeper + +- name: create zookeeper symlink + file: + path={{ zookeeper_install_dir }}/default + state=link + src={{ zookeeper_install_dir }}/zookeeper-{{ zookeeper_version }} + tags: zookeeper + +- name: create zookeeper directories + file: + path={{ item }} + state=directory + mode=0744 + with_items: + - "{{ zookeeper_conf_dir }}" + - "{{ zookeeper_log_dir }}" + - "{{ zookeeper_data_dir }}" + tags: zookeeper + +- name: copy supervisord config + template: + src=zookeeper-supervisord.conf.j2 + dest={{ supervisord_programs_dir }}/zookeeper-supervisord.conf + 
mode=0644 + notify: + - reread supervisord + tags: zookeeper + +- name: setup zoo.cfg + template: + dest={{ zookeeper_conf_dir }}/zoo.cfg + src=zoo.cfg.j2 + notify: + - restart zookeeper + - wait for zookeeper port + tags: zookeeper + +# - name: check for existing data directory +# stat: path={{ zookeeper_data_dir }} +# register: data_exists +# tags: zookeeper + +# - name: initialize data directory +# shell: "{{service zookeeper-server init}}" +# when: data_exists.stat.isdir is not defined +# tags: zookeeper + +- name: setup myid file for zookeeper + template: + dest={{zookeeper_data_dir}}/myid + src=myid.j2 + notify: + - restart zookeeper + - wait for zookeeper port + tags: zookeeper + +- name: set up aliases + lineinfile: dest="~/.bashrc" line="export ZOO={{ zookeeper_base_dir }}" + lineinfile: dest="~/.bashrc" line="alias zkcli='{{ zookeeper_base_dir }}/bin/zkCli.sh'" + tags: alias diff --git a/ansible/roles/zookeeper/templates/myid.j2 b/ansible/roles/zookeeper/templates/myid.j2 new file mode 100644 index 0000000..bbd57bc --- /dev/null +++ b/ansible/roles/zookeeper/templates/myid.j2 @@ -0,0 +1,5 @@ +{% for host in zookeeper_host_list %} + {%- if host == inventory_hostname -%} +{{ loop.index }} + {%- endif -%} +{% endfor %} diff --git a/ansible/roles/zookeeper/templates/zoo.cfg.j2 b/ansible/roles/zookeeper/templates/zoo.cfg.j2 new file mode 100644 index 0000000..2fdfbf4 --- /dev/null +++ b/ansible/roles/zookeeper/templates/zoo.cfg.j2 @@ -0,0 +1,60 @@ +# http://hadoop.apache.org/zookeeper/docs/current/zookeeperAdmin.html + +# The number of milliseconds of each tick +tickTime={{ zookeeper_tick_time|default(2000) }} + +# The number of ticks that the initial synchronization phase can take +initLimit={{ zookeeper_init_limit|default(10) }} + +# The number of ticks that can pass between sending a request and getting an acknowledgement +syncLimit={{ zookeeper_sync_limit|default(5) }} + +# the directory where the snapshot is stored. 
+dataDir={{ zookeeper_data_dir|default('/var/lib/zookeeper') }} + +# Place the dataLogDir to a separate physical disc for better performance +# dataLogDir=/disk2/zookeeper + +# the port at which the clients will connect +clientPort={{ zookeeper_client_port|default(2181) }} + +# Maximum number of concurrent connections a single client (by IP) may open +maxClientCnxns={{ zookeeper_maxClientCnxns|default(50) }} + +# specify all zookeeper servers +# The first port is used by followers to connect to the leader +# The second one is used for leader election +{% for host in zookeeper_host_list %} +server.{{ loop.index }}={{ host }}:{{ zookeeper_conn_port }}:{{ zookeeper_lead_port }} +{% endfor %} + + +# To avoid seeks ZooKeeper allocates space in the transaction log file in +# blocks of preAllocSize kilobytes. The default block size is 64M. One reason +# for changing the size of the blocks is to reduce the block size if snapshots +# are taken more often. (Also, see snapCount). +#preAllocSize=65536 + +# Clients can submit requests faster than ZooKeeper can process them, +# especially if there are a lot of clients. To prevent ZooKeeper from running +# out of memory due to queued requests, ZooKeeper will throttle clients so that +# there is no more than globalOutstandingLimit outstanding requests in the +# system. The default limit is 1,000. ZooKeeper logs transactions to a +# transaction log. After snapCount transactions are written to a log file a +# snapshot is started and a new transaction log file is started. The default +# snapCount is 10,000. +#snapCount=1000 + +# If this option is defined, requests will be logged to a trace file named +# traceFile.year.month.day. +#traceFile= + +# Leader accepts client connections. Default value is "yes". The leader machine +# coordinates updates. For higher update throughput at the slight expense of +# read throughput the leader can be configured to not accept clients and focus +# on coordination.
+#leaderServes=yes + +# Autopurge, purge every day +autopurge.purgeInterval={{ zookeeper_auto_purge_interval|default(24) }} +autopurge.snapRetainCount={{ zookeeper_auto_purge_snap_retain_count|default(5) }} diff --git a/ansible/roles/zookeeper/templates/zookeeper-supervisord.conf.j2 b/ansible/roles/zookeeper/templates/zookeeper-supervisord.conf.j2 new file mode 100644 index 0000000..b52e501 --- /dev/null +++ b/ansible/roles/zookeeper/templates/zookeeper-supervisord.conf.j2 @@ -0,0 +1,5 @@ +[program:zookeeper] +command={{ zookeeper_base_dir }}/bin/zkServer.sh start-foreground +autostart=true +autorestart=true +stopsignal=KILL diff --git a/ansible/sc.inventory b/ansible/sc.inventory new file mode 100644 index 0000000..afdccd5 --- /dev/null +++ b/ansible/sc.inventory @@ -0,0 +1,17 @@ +# Scrapy Cluster inventory + +default ansible_ssh_host=192.168.33.99 + +[kafka] +scdev + +[zookeeper] +scdev + +[redis] +scdev + +[scrapy-cluster:children] +kafka +zookeeper +redis \ No newline at end of file diff --git a/ansible/scrapy-cluster.yml b/ansible/scrapy-cluster.yml new file mode 100644 index 0000000..dd40513 --- /dev/null +++ b/ansible/scrapy-cluster.yml @@ -0,0 +1,21 @@ +--- +# Update apt-cache and install the latest Miniconda +- hosts: all + sudo: yes + tasks: + - name: Update apt cache + apt: update-cache=yes + roles: + - miniconda + +# Install kafka, zookeeper, redis +- include: kafka.yml +- include: zookeeper.yml +- include: redis.yml + +# Restart Kafka because it's finicky like that +- hosts: scdev + sudo: yes + tasks: + - name: Restart kafka + supervisorctl: name='kafka' state=restarted \ No newline at end of file diff --git a/ansible/zookeeper.yml b/ansible/zookeeper.yml new file mode 100644 index 0000000..de2962e --- /dev/null +++ b/ansible/zookeeper.yml @@ -0,0 +1,11 @@ +--- +- name: ZooKeeper Ensemble + hosts: zookeeper + + sudo: yes + + vars: + - zookeeper_host_list: "{{ groups['zookeeper'] }}" + + roles: + - zookeeper diff --git a/requirements.txt
b/requirements.txt index d899bdf..74256f9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,7 @@ requests-oauthlib>=0.3.2 redis kafka-python python-dateutil -click \ No newline at end of file +click +scutils +pymysql +nose diff --git a/scripts/rule-extract.py b/scripts/rule_extract.py similarity index 73% rename from scripts/rule-extract.py rename to scripts/rule_extract.py index 6a1e956..63374aa 100644 --- a/scripts/rule-extract.py +++ b/scripts/rule_extract.py @@ -105,7 +105,7 @@ def _fix_follow(raw_rules): new_rules = [] for idx, d in enumerate(raw_rules): # Twitter rules only only a single twitter id - m = re.search(r'(\d{7,10})', d['value']) + m = re.search(r'(\d{7,})', d['value']) if m: d['value'] = m.group(1) new_rules.append(d) @@ -129,28 +129,42 @@ def _fix_track(raw_rules): return new_rules -def send_to_redis(traptor_type, - rules, - host=redis_settings['HOST'], - port=redis_settings['PORT'], - db=redis_settings['DB'] - ): - """ Send rules to Redis""" +class RulesToRedis(object): + """ Class to connect to redis and send traptor rules. """ + def __init__(self, + host=redis_settings['HOST'], + port=redis_settings['PORT'], + db=redis_settings['DB'] + ): + + self.host = host + self.port = port + self.db = db - # Set up API limitation checks - if traptor_type == 'follow': - rule_max = 5000 - elif traptor_type == 'track': - rule_max = 400 - else: - raise ValueError('{} is not a valid traptor_type'.format(traptor_type)) + def rule_max(self, traptor_type): + """ Send the rule_max based on what traptor_type is passed in. 
""" + if traptor_type == 'follow': + self._rule_max = 5000 + elif traptor_type == 'track': + self._rule_max = 400 + else: + raise ValueError('{} is not a valid traptor_type'.format( + traptor_type)) - r = redis.StrictRedis(host=host, port=port, db=db) + return self._rule_max - for idx, d in enumerate(rules): - crawler_num = idx / rule_max - logging.debug('idx: {}, crawler_num: {}'.format(idx, crawler_num)) - r.hmset('traptor-{0}:{1}:{2}'.format(traptor_type, crawler_num, idx), d) + def connect(self): + """ Connect to a Redis database. """ + self.redis_conn = redis.StrictRedis(host=self.host, port=self.port, + db=self.db) + + def send_rules(self, traptor_type, rules): + """ Send rules out to Redis with the appropriate key, value format. """ + for idx, d in enumerate(rules): + crawler_num = idx / self.rule_max(traptor_type) + logging.debug('idx: {}, crawler_num: {}'.format(idx, crawler_num)) + self.redis_conn.hmset('traptor-{0}:{1}:{2}'.format( + traptor_type, crawler_num, idx), d) if __name__ == '__main__': @@ -162,4 +176,7 @@ def send_to_redis(traptor_type, for i in rules: logging.debug(i) - send_to_redis(sys.argv[1], rules) + + rc = RulesToRedis() + rc.connect() + rc.send_rules(sys.argv[1], rules) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/offline_tests.py b/tests/offline_tests.py new file mode 100644 index 0000000..57ee832 --- /dev/null +++ b/tests/offline_tests.py @@ -0,0 +1,40 @@ +# from nose.tools import assert_equal +from unittest import TestCase +from mock import MagicMock + +from scripts.rule_extract import CooperRules, RulesToRedis +from sample_rules import FOLLOW_RULES, TRACK_RULES + + +class TestRuleExtract(TestCase): + + def setUp(self): + + self.follow_rules = FOLLOW_RULES + self.track_rules = TRACK_RULES + + self.fixed_follow_rules = [ + {'tag': 'marketing', 'value': '345345234'}, + {'tag': 'random', 'value': '34534509889'} + ] + self.fixed_track_rules = [ + {'tag': 'short link', 
'value': 'dump to'}, + {'tag': 'ref_keywords', 'value': 'something random'} + ] + + def test_fix_follow(self): + """ Testing that the "follow" rules get normalized to Twitter format. + """ + fixed_rules = CooperRules._fix_follow(self.follow_rules) + self.assertEqual(fixed_rules, self.fixed_follow_rules) + + def test_fix_track(self): + """ Testing that the "track" rules get normalized to Twitter format. + """ + fixed_rules = CooperRules._fix_track(self.track_rules) + self.assertEqual(fixed_rules, self.fixed_track_rules) + + def test_send_to_redis(self): + """ Testing the logic that sends the rules to Redis. """ + # rc1 = RulesToRedis('track', self.fixed_track_rules) + raise NotImplementedError \ No newline at end of file diff --git a/tests/online_tests.py b/tests/online_tests.py new file mode 100644 index 0000000..dc8b728 --- /dev/null +++ b/tests/online_tests.py @@ -0,0 +1,108 @@ +# from nose.tools import assert_equal +import json +from unittest import TestCase +from mock import MagicMock +from kafka import SimpleProducer +from redis import StrictRedis, ConnectionError + +from scripts.rule_extract import CooperRules, RulesToRedis +from sample_rules import FOLLOW_RULES, TRACK_RULES +from traptor.traptor import (MyClient, get_redis_twitter_rules, + create_kafka_producer, + create_birdy_stream, clean_tweet_data, run, + tweet_time_to_iso + ) + +from traptor.settings import (KAFKA_HOSTS, KAFKA_TOPIC, APIKEYS, TRAPTOR_ID, + TRAPTOR_TYPE, REDIS_HOST) +from traptor.birdy.twitter import StreamResponse + + +class TestTwitterRules(TestCase): + """ Test that we can read the rules from Redis and return a twitter + rule string. 
+ """ + def setUp(self): + + self.fixed_follow_rules = CooperRules._fix_follow(FOLLOW_RULES) + self.fixed_track_rules = CooperRules._fix_track(TRACK_RULES) + + # Need to test RulesToRedis class somewhere + r = RulesToRedis() + r.connect() + r.redis_conn.flushall() + r.send_rules('track', self.fixed_track_rules) + r.send_rules('follow', self.fixed_follow_rules) + + self.follow_tw_rules = [ + {'tag': 'random', 'value': '34534509889'}, + {'tag': 'marketing', 'value': '345345234'} + ] + self.track_tw_rules = [ + {'tag': 'short link', 'value': 'dump to'}, + {'tag': 'ref_keywords', 'value': 'something random'} + ] + + + def test_follow(self): + """ Test that FOLLOW rules parse correctly per the Twitter API. """ + tw_rules = get_redis_twitter_rules('follow', '0', 'localhost') + self.assertEqual(sorted(tw_rules), sorted(self.follow_tw_rules)) + + def test_track(self): + """ Test that TRACK rules parse correctly per the Twitter API. """ + tw_rules = get_redis_twitter_rules('track', '0', 'localhost') + self.assertEqual(sorted(tw_rules), sorted(self.track_tw_rules)) + + +class TestKafkaProducer(TestCase): + def test_producer(self): + """ Test that we can connect to Kafka and create a producer. """ + kafka_producer = create_kafka_producer('localhost:9092', 'traptor123') + self.assertIsInstance(kafka_producer, SimpleProducer) + + +class TestBirdyStream(TestCase): + """ Test that we can create a birdy stream. + To do: mock the twitter connection so that we don't get rate limited. + """ + def setUp(self): + self.track_tw_rules = 'dump to,something random' + self.follow_tw_rules = '345345234,34534509889' + + def test_birdy_follow(self): + """ Test that we can create a birdy FOLLOW stream. """ + birdy_stream = create_birdy_stream(self.follow_tw_rules, APIKEYS, + 'follow', '0') + self.assertIsInstance(birdy_stream, StreamResponse) + + def test_birdy_track(self): + """ Test that we can create a birdy TRACK stream. 
""" + birdy_stream = create_birdy_stream(self.track_tw_rules, APIKEYS, + 'track', '1') + self.assertIsInstance(birdy_stream, StreamResponse) + +class TestCleanTweetData(TestCase): + def setUp(self): + self.twitter_time = "Wed Nov 25 19:36:51 +0000 2015" + self.iso_time = "2015-11-25T19:36:51+00:00" + + with open('tests/sample_raw_tweets.json') as f: + self.raw_tweets = [json.loads(line) for line in f] + + with open('tests/sample_cleaned_tweets.json') as f: + self.cleaned_tweets = [json.loads(line) for line in f] + + def test_tweet_time_to_iso(self): + """ Test that the default twitter time format is converted to ISO. """ + self.assertEqual(tweet_time_to_iso(self.twitter_time), self.iso_time) + + def test_clean_tweet_data(self): + """ That that the raw tweet data is cleaned according to the expected + format. + """ + self.assertItemsEqual(clean_tweet_data(self.raw_tweets[0]), + self.cleaned_tweets[0]) + + def test_add_rule_tags(self): + raise NotImplementedError diff --git a/tests/sample_cleaned_tweets.json b/tests/sample_cleaned_tweets.json new file mode 100644 index 0000000..970aed5 --- /dev/null +++ b/tests/sample_cleaned_tweets.json @@ -0,0 +1 @@ +{"contributors": null, "truncated": false, "text": "https://t.co/zNiGVvqpvO \u0627\u0636\u062e\u0645 \u0627\u0641\u0644\u0627\u0645 #\u0627\u0644\u062f\u0648\u0644\u0629_\u0627\u0644\u0625\u0633\u0644\u0627\u0645\u064a\u0629 (https://t.co/OtYI7VpLPk) #\u062d\u0627\u0626\u0644 #\u0634\u0631\u0648\u0631\u0629 #\u062c\u0627\u0632\u0627\u0646 #\u0627\u0644\u0645\u062f\u064a\u0646\u0629_\u0627\u0644\u0645\u0646\u0648\u0631\u0629 #\u062a\u0628\u0648\u0643 #\u0645\u0635\u0631 RT Promotio\u2026", "is_quote_status": false, "in_reply_to_status_id": null, "id": 669600654558654471, "favorite_count": 0, "source": "IFTTT", "retweeted": false, "coordinates": null, "timestamp_ms": "1448480211282", "entities": {"user_mentions": [], "symbols": [], "hashtags": [{"indices": [35, 52], "text": 
"\u0627\u0644\u062f\u0648\u0644\u0629_\u0627\u0644\u0625\u0633\u0644\u0627\u0645\u064a\u0629"}, {"indices": [79, 84], "text": "\u062d\u0627\u0626\u0644"}, {"indices": [85, 91], "text": "\u0634\u0631\u0648\u0631\u0629"}, {"indices": [92, 98], "text": "\u062c\u0627\u0632\u0627\u0646"}, {"indices": [99, 115], "text": "\u0627\u0644\u0645\u062f\u064a\u0646\u0629_\u0627\u0644\u0645\u0646\u0648\u0631\u0629"}, {"indices": [116, 121], "text": "\u062a\u0628\u0648\u0643"}, {"indices": [122, 126], "text": "\u0645\u0635\u0631"}], "urls": [{"url": "https://t.co/zNiGVvqpvO", "indices": [0, 23], "expanded_url": "https://twitter.com/lkajsfhafh/status/627139474884042752/photo/1", "display_url": "pic.twitter.com/zNiGVvqpvO"}, {"url": "https://t.co/OtYI7VpLPk", "indices": [54, 77], "expanded_url": "https://dump.to/e9s", "display_url": "dump.to/e9s"}]}, "in_reply_to_screen_name": null, "id_str": "669600654558654471", "retweet_count": 0, "in_reply_to_user_id": null, "favorited": false, "user": {"follow_request_sent": null, "profile_use_background_image": true, "default_profile_image": false, "id": 3234202126, "verified": false, "profile_image_url_https": "https://pbs.twimg.com/profile_images/635516579502006272/UBlihRXO_normal.jpg", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "followers_count": 48, "profile_sidebar_border_color": "C0DEED", "id_str": "3234202126", "profile_background_color": "C0DEED", "listed_count": 1, "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", "utc_offset": -28800, "statuses_count": 193901, "description": null, "friends_count": 0, "location": null, "profile_link_color": "0084B4", "profile_image_url": "http://pbs.twimg.com/profile_images/635516579502006272/UBlihRXO_normal.jpg", "following": null, "geo_enabled": false, "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", "name": "\u062d\u0642\u0627\u0626\u0642 \u064a\u062e\u0641\u064a\u0647\u0627 
\u0627\u0644\u0627\u0639\u0644\u0627\u0645", "lang": "en", "profile_background_tile": false, "favourites_count": 0, "screen_name": "klhhdahdo", "notifications": null, "url": null, "created_at": "Mon May 04 19:59:22 +0000 2015", "contributors_enabled": false, "time_zone": "Pacific Time (US & Canada)", "protected": false, "default_profile": true, "is_translator": false}, "geo": null, "in_reply_to_user_id_str": null, "possibly_sensitive": false, "lang": "und", "created_at": "2015-11-25T19:36:51+00:00", "filter_level": "low", "in_reply_to_status_id_str": null, "place": null} diff --git a/tests/sample_raw_tweets.json b/tests/sample_raw_tweets.json new file mode 100644 index 0000000..32682ee --- /dev/null +++ b/tests/sample_raw_tweets.json @@ -0,0 +1 @@ +{"contributors": null, "truncated": false, "text": "https://t.co/zNiGVvqpvO \u0627\u0636\u062e\u0645 \u0627\u0641\u0644\u0627\u0645 #\u0627\u0644\u062f\u0648\u0644\u0629_\u0627\u0644\u0625\u0633\u0644\u0627\u0645\u064a\u0629 (https://t.co/OtYI7VpLPk) #\u062d\u0627\u0626\u0644 #\u0634\u0631\u0648\u0631\u0629 #\u062c\u0627\u0632\u0627\u0646 #\u0627\u0644\u0645\u062f\u064a\u0646\u0629_\u0627\u0644\u0645\u0646\u0648\u0631\u0629 #\u062a\u0628\u0648\u0643 #\u0645\u0635\u0631 RT Promotio\u2026", "is_quote_status": false, "in_reply_to_status_id": null, "id": 669600654558654471, "favorite_count": 0, "source": "IFTTT", "retweeted": false, "coordinates": null, "timestamp_ms": "1448480211282", "entities": {"user_mentions": [], "symbols": [], "hashtags": [{"indices": [35, 52], "text": "\u0627\u0644\u062f\u0648\u0644\u0629_\u0627\u0644\u0625\u0633\u0644\u0627\u0645\u064a\u0629"}, {"indices": [79, 84], "text": "\u062d\u0627\u0626\u0644"}, {"indices": [85, 91], "text": "\u0634\u0631\u0648\u0631\u0629"}, {"indices": [92, 98], "text": "\u062c\u0627\u0632\u0627\u0646"}, {"indices": [99, 115], "text": "\u0627\u0644\u0645\u062f\u064a\u0646\u0629_\u0627\u0644\u0645\u0646\u0648\u0631\u0629"}, {"indices": [116, 121], "text": 
"\u062a\u0628\u0648\u0643"}, {"indices": [122, 126], "text": "\u0645\u0635\u0631"}], "urls": [{"url": "https://t.co/zNiGVvqpvO", "indices": [0, 23], "expanded_url": "https://twitter.com/lkajsfhafh/status/627139474884042752/photo/1", "display_url": "pic.twitter.com/zNiGVvqpvO"}, {"url": "https://t.co/OtYI7VpLPk", "indices": [54, 77], "expanded_url": "https://dump.to/e9s", "display_url": "dump.to/e9s"}]}, "in_reply_to_screen_name": null, "id_str": "669600654558654471", "retweet_count": 0, "in_reply_to_user_id": null, "favorited": false, "user": {"follow_request_sent": null, "profile_use_background_image": true, "default_profile_image": false, "id": 3234202126, "verified": false, "profile_image_url_https": "https://pbs.twimg.com/profile_images/635516579502006272/UBlihRXO_normal.jpg", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "followers_count": 48, "profile_sidebar_border_color": "C0DEED", "id_str": "3234202126", "profile_background_color": "C0DEED", "listed_count": 1, "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", "utc_offset": -28800, "statuses_count": 193901, "description": null, "friends_count": 0, "location": null, "profile_link_color": "0084B4", "profile_image_url": "http://pbs.twimg.com/profile_images/635516579502006272/UBlihRXO_normal.jpg", "following": null, "geo_enabled": false, "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", "name": "\u062d\u0642\u0627\u0626\u0642 \u064a\u062e\u0641\u064a\u0647\u0627 \u0627\u0644\u0627\u0639\u0644\u0627\u0645", "lang": "en", "profile_background_tile": false, "favourites_count": 0, "screen_name": "klhhdahdo", "notifications": null, "url": null, "created_at": "Mon May 04 19:59:22 +0000 2015", "contributors_enabled": false, "time_zone": "Pacific Time (US & Canada)", "protected": false, "default_profile": true, "is_translator": false}, "geo": null, "in_reply_to_user_id_str": null, "possibly_sensitive": false, "lang": 
"und", "created_at": "Wed Nov 25 19:36:51 +0000 2015", "filter_level": "low", "in_reply_to_status_id_str": null, "place": null} diff --git a/tests/sample_rules.py b/tests/sample_rules.py new file mode 100644 index 0000000..c43d045 --- /dev/null +++ b/tests/sample_rules.py @@ -0,0 +1,25 @@ +FOLLOW_RULES = ( + { + 'tag': 'marketing', + 'value': 'from:345345234 OR someguy' + }, + { + 'tag': 'sales', + 'value': 'someguy' + }, + { + 'tag': 'random', + 'value': 'from:34534509889 OR someguy' + } +) + +TRACK_RULES = ( + { + 'tag': 'short link', + 'value': 'url_contains: dump.to' + }, + { + 'tag': 'ref_keywords', + 'value': 'something random' + } +) diff --git a/traptor/traptor.py b/traptor/traptor.py index d3b4f26..5dfea9a 100644 --- a/traptor/traptor.py +++ b/traptor/traptor.py @@ -17,7 +17,7 @@ from settings import (KAFKA_HOSTS, KAFKA_TOPIC, APIKEYS, TRAPTOR_ID, TRAPTOR_TYPE, REDIS_HOST) -logger = LogObject(name='traptor', level='DEBUG') +logger = LogObject(name='traptor', level='INFO') # Override the default JSONobject @@ -96,6 +96,7 @@ def create_kafka_producer(kafka_hosts=KAFKA_HOSTS, kafka_topic=KAFKA_TOPIC): def create_birdy_stream(rules, + apikeys=APIKEYS, traptor_type=TRAPTOR_TYPE, traptor_id=TRAPTOR_ID, ): @@ -132,15 +133,14 @@ def create_birdy_stream(rules, logger.critical('That traptor type has not been implemented yet') sys.exit(3) +def tweet_time_to_iso(tweet_time): + return parser.parse(tweet_time).isoformat() def clean_tweet_data(tweet_dict): """ Do any pre-processing to raw tweet data before passing on to Kafka """ - def tweet_time_to_iso(tweet_time): - return parser.parse(tweet_time).isoformat() - if tweet_dict.get('created_at'): tweet_dict['created_at'] = tweet_time_to_iso(tweet_dict['created_at']) return tweet_dict @@ -151,11 +151,11 @@ def tweet_time_to_iso(tweet_time): def run(test): # Grab a list of {tag:, value:} rules rules = get_redis_twitter_rules() - logger.info(rules) + logger.debug(rules) # Concatenate all of the rule['value'] fields rules_str 
= ','.join([rule['value'] for rule in rules]) - logger.info(rules_str) + logger.debug(rules_str) if not test: # Set up Kafka producer @@ -168,7 +168,7 @@ def run(test): # Iterate through the twitter results for _data in birdyclient.stream(): - # logger.debug('Raw Data: {0}'.format(json.dumps(_data))) + logger.debug('Raw Data: {0}'.format(json.dumps(_data))) # Do tweet data pre-processing data = clean_tweet_data(_data)