from datetime import datetime
from optparse import make_option

from django.core.management.base import BaseCommand, CommandError

import commonware.log

from addons.models import File
# TODO: use DownloadCount when the script is proven to work correctly.
from stats.models import update_inc, DownloadCountTmp as DownloadCount


log = commonware.log.getLogger('adi.downloadcountsfromfile')


class Command(BaseCommand):
    """Update download count metrics from a file in the database.

    Usage:
    ./manage.py download_counts_from_file <file> --date=YYYY-MM-DD

    We get a row for each "addon download" request, in this format
    (presumably <count> <file id> <click source> — the placeholders were
    lost in the original docstring, TODO confirm against download_metrics):

    There is one DownloadCount entry per addon per day, and each field
    holds the json-ified dict of keys/counters.

    Eg, for the above request:

        addon: the addon owning that file id
        count: the number of requests for that addon on that day
        date: the day the requests were made
        src: {'dp-btn-primary': 1}

    """
    help = __doc__

    option_list = BaseCommand.option_list + (
        make_option('--date', action='store', type='string',
                    dest='date', help='Date in the YYYY-MM-DD format.'),
        make_option('--separator', action='store', type='string', default='\t',
                    dest='separator', help='Field separator in file.'),
    )

    def handle(self, *args, **options):
        start = datetime.now()  # Measure the time it takes to run the script.
        day = options['date']
        if not day:
            raise CommandError('You must specify a --date parameter in the '
                               'YYYY-MM-DD format.')
        # Fail with a usage error, not an IndexError, when the file argument
        # is missing.
        if not args:
            raise CommandError('You must provide the file to read the '
                               'download counts from.')
        sep = options['separator']
        filename = args[0]
        # First, make sure we don't have any existing counts for the same day,
        # or it would just increment again the same data.
        DownloadCount.objects.filter(date=day).delete()

        # Memoize the DownloadCounts, one per addon.
        download_counts = {}
        # Perf: preload all the files once and for all.
        # This builds a dict where each key (the file_id we get from the hive
        # query) has the addon_id as value.
        files_to_addon = dict(File.objects.values_list('id',
                                                       'version__addon_id'))

        index = -1  # So the final report below works even on an empty file.
        with open(filename) as count_file:
            for index, line in enumerate(count_file):
                if index and (index % 10000) == 0:
                    log.info('Processed %s lines' % index)

                # Drop the trailing newline before splitting the fields.
                splitted = line[:-1].split(sep)

                if len(splitted) != 3:
                    log.debug('Badly formatted row: %s' % line)
                    continue

                counter, file_id, src = splitted
                try:
                    file_id, counter = int(file_id), int(counter)
                except ValueError:  # Badly formatted? Drop.
                    continue

                # Does this file exist?
                if file_id in files_to_addon:
                    addon_id = files_to_addon[file_id]
                else:
                    log.info('File with id: %s not found' % file_id)
                    continue

                # Memoize the DownloadCount.
                if addon_id in download_counts:
                    dc = download_counts[addon_id]
                else:
                    dc = DownloadCount(date=day, addon_id=addon_id, count=0)
                    download_counts[addon_id] = dc

                # We can now fill the DownloadCount object.
                dc.count += counter
                dc.sources = update_inc(dc.sources, src, counter)

        # Create in bulk: this is much faster.
        DownloadCount.objects.bulk_create(download_counts.values(), 100)
        total_time = (datetime.now() - start).total_seconds()
        log.info('Processed a total of %s lines' % (index + 1))
        log.debug('Total processing time: %s seconds' % total_time)
import os
from datetime import datetime
from optparse import make_option

import pyhs2
from pyhs2 import connections, cursor
from pyhs2.TCLIService.ttypes import TFetchOrientation, TFetchResultsReq

from django.core.management.base import BaseCommand, CommandError

import commonware.log


log = commonware.log.getLogger('adi.export')


fetch_time = 0  # Accumulated seconds spent waiting on hive, for reporting.


# This class and the following are needed because the pyhs2 lib doesn't
# return a generator, but a full list! Overriding fetch() lets us stream
# rows instead of materializing the whole result set in memory.
class YieldedCursor(cursor.Cursor):
    """A pyhs2 cursor whose fetch() yields rows one at a time."""

    def fetch(self):
        batch_size = int(os.getenv('MAX_HIVE_ROWS', 10000))
        fetch_request = TFetchResultsReq(
            operationHandle=self.operationHandle,
            orientation=TFetchOrientation.FETCH_NEXT,
            maxRows=batch_size)

        while True:
            global fetch_time
            # Measure the time it takes to retrieve from hive.
            started = datetime.now()
            response = self.client.FetchResults(fetch_request)
            fetch_time += (datetime.now() - started).total_seconds()
            rows = response.results.rows
            if len(rows) == 0:
                break
            for row in rows:
                yield [pyhs2.cursor.get_value(col) for col in row.colVals]


class ClevererConnection(connections.Connection):
    """A pyhs2 connection that hands out our streaming YieldedCursor.

    Yeah, it seems pyhs2 isn't dealing well with so much data... so there's
    just a huge list returned by the stock cursor.

    """

    def cursor(self):
        return YieldedCursor(self.client, self.session)


class Command(BaseCommand):
    """Execute queries on HIVE, and store the results on disk.

    Query the downloads or updates requests for addons on HIVE. These will
    then be processed by other scripts to store counts in the DownloadCount
    and UpdateCount objects.

    Usage:
    ./manage.py download_metrics --date YYYY-MM-DD \
                                 --with-updates --with-downloads

    Set a ``MAX_HIVE_ROWS`` environment variable to minimize the network
    latency (default is 10000 rows fetched from hive at once), but it will
    increase the memory footprint.

    """
    help = __doc__

    option_list = BaseCommand.option_list + (
        make_option('--output', action='store', type='string',
                    dest='filename', help='Filename to output to.'),
        make_option('--separator', action='store', type='string', default='\t',
                    dest='separator', help='Field separator in file.'),
        make_option('--date', action='store', type='string',
                    dest='date', help='Date in the YYYY-MM-DD format.'),
        make_option('--with-updates', action='store_true', default=False,
                    dest='with_updates', help='Store update requests.'),
        make_option('--with-downloads', action='store_true', default=False,
                    dest='with_downloads', help='Store download requests.'),
        make_option('--limit', action='store', type='int',
                    dest='limit', help='(debug) max number of requests.'),
    )

    def handle(self, *args, **options):
        day = options['date']
        if not day:
            raise CommandError('You must specify a --date parameter in the '
                               ' YYYY-MM-DD format.')
        filename = options['filename']
        if filename is None:
            filename = day  # Default the output prefix to the date.
        sep = options['separator']
        limit = options['limit']
        with_updates = options['with_updates']
        with_downloads = options['with_downloads']
        if not (with_updates or with_downloads):
            raise CommandError('Please specify at least one of --with-updates '
                               'or --with-downloads.')

        # NOTE(review): connection credentials are hard-coded; presumably
        # acceptable for the internal metrics cluster — confirm.
        with ClevererConnection(host='peach-gw.peach.metrics.scl3.mozilla.com',
                                port=10000,
                                user='aphadke',
                                password='',
                                authMechanism='PLAIN') as conn:
            num_reqs = 0
            with conn.cursor() as cur:
                start = datetime.now()  # Measure the time to run the script.
                if with_downloads:
                    num_reqs += self.process_downloads(
                        cur, day, filename, sep=sep, limit=limit)
                if with_updates:
                    num_reqs += self.process_updates(
                        cur, day, filename, sep=sep, limit=limit)

        total_time = (datetime.now() - start).total_seconds()
        log.info('Stored a total of %s requests' % num_reqs)
        log.debug('Total processing time: %s seconds' % total_time)
        log.debug('Time spent fetching data from hive over the network: %s' %
                  fetch_time)

    def process_updates(self, cur, day, filename, sep, limit=None):
        """Query the update requests and store them on disk."""
        limit_clause = ('limit %s' % limit) if limit else ''
        # We use "concat" and http://a.com in the following request to have
        # fully qualified URLs.
        cur.execute("select count(1), "
                    " parse_url(concat('http://a.com',request_url), 'QUERY', 'id'), "
                    " parse_url(concat('http://a.com',request_url), 'QUERY', 'version'), "
                    " parse_url(concat('http://a.com',request_url), 'QUERY', 'status'), "
                    " parse_url(concat('http://a.com',request_url), 'QUERY', 'appID'), "
                    " parse_url(concat('http://a.com',request_url), 'QUERY', 'appVersion'), "
                    " parse_url(concat('http://a.com',request_url), 'QUERY', 'appOS'), "
                    " parse_url(concat('http://a.com',request_url), 'QUERY', 'locale'), "
                    " parse_url(concat('http://a.com',request_url), 'QUERY', 'updateType') "
                    "from v2_raw_logs "
                    "where domain='versioncheck.addons.mozilla.org' "
                    " and ds='%s' "
                    " and request_url like '/update/VersionCheck.php?%%' "
                    "group by "
                    " parse_url(concat('http://a.com',request_url), 'QUERY', 'id'), "
                    " parse_url(concat('http://a.com',request_url), 'QUERY', 'version'), "
                    " parse_url(concat('http://a.com',request_url), 'QUERY', 'status'), "
                    " parse_url(concat('http://a.com',request_url), 'QUERY', 'appID'), "
                    " parse_url(concat('http://a.com',request_url), 'QUERY', 'appVersion'), "
                    " parse_url(concat('http://a.com',request_url), 'QUERY', 'appOS'), "
                    " parse_url(concat('http://a.com',request_url), 'QUERY', 'locale'), "
                    " parse_url(concat('http://a.com',request_url), 'QUERY', 'updateType') "
                    "%s" % (day, limit_clause))

        return self.to_file(cur, '%s.updates' % filename, sep)

    def process_downloads(self, cur, day, filename, sep, limit=None):
        """Query the download requests and store them on disk."""
        limit_clause = ('limit %s' % limit) if limit else ''
        # We use "concat" and http://a.com in the following request to have
        # fully qualified URLs.
        cur.execute("select count(1), "
                    " split(request_url,'/')[4], "
                    " parse_url(concat('http://a.com',request_url), 'QUERY', 'src') "
                    "from v2_raw_logs "
                    "where domain='addons.mozilla.org' "
                    " and ds='%s' "
                    " and request_url like '/firefox/downloads/file/%%' "
                    " and !(parse_url(concat('http://a.com',request_url), 'QUERY', 'src') LIKE 'sync') "
                    "group by "
                    " split(request_url,'/')[4], "
                    " parse_url(concat('http://a.com',request_url), 'QUERY', 'src') "
                    "%s" % (day, limit_clause))
        return self.to_file(cur, '%s.downloads' % filename, sep)

    def to_file(self, cur, filename, sep):
        """Write each complete row from the cursor; return the row count."""
        log.info('Storing hive results in %s' % filename)
        count = 0
        with open(filename, 'w') as outfile:
            for row in cur.fetch():
                count += 1
                if (count % 100000) == 0:
                    log.info('Processed %s requests' % count)
                if None in row:  # Incomplete result: skip.
                    continue
                outfile.write(sep.join(str(col) for col in row))
                outfile.write('\n')

        return count
from datetime import datetime
from optparse import make_option

from django.core.management.base import BaseCommand, CommandError

import commonware.log

from addons.models import Addon
# TODO: use UpdateCount when the script is proven to work correctly.
from stats.models import update_inc, UpdateCountTmp as UpdateCount


log = commonware.log.getLogger('adi.updatecountsfromfile')


class Command(BaseCommand):
    """Update check versions count metrics from a file in the database.

    Usage:
    ./manage.py update_counts_from_file <file> --date=YYYY-MM-DD

    We get a row for each "version check" request, one field per column of
    the hive query in download_metrics: count, addon guid, version, status,
    appID, appVersion, appOS, locale, updateType.

    There is one UpdateCount entry per addon per day, and each field holds
    the json-ified dict of keys/counters.

    Eg, for the above request:

        addon: the addon matching that guid
        count: the number of requests for that addon on that day
        date: the day the requests were made
        versions: {'0.6.2': 1}
        statuses: {'userDisabled': 1, 'incompatible': 1}
        applications (app and app version):
            {'{ec8030f7-c20a-464f-9b0e-13a3a9e97384}': {'20.0': 1}}
        oses: {'Darwin': 1}
        locales: {'en-US': 1}

    The "applications" field is the most complicated to deal with, because
    it's a dict of dicts: each key of the dict (the application guid) has a
    value of a dict of versions of this application, and the count.

    """
    help = __doc__

    option_list = BaseCommand.option_list + (
        make_option('--date', action='store', type='string',
                    dest='date', help='Date in the YYYY-MM-DD format.'),
        make_option('--separator', action='store', type='string', default='\t',
                    dest='separator', help='Field separator in file.'),
    )

    def handle(self, *args, **options):
        start = datetime.now()  # Measure the time it takes to run the script.
        day = options['date']
        if not day:
            raise CommandError('You must specify a --date parameter in the '
                               'YYYY-MM-DD format.')
        # Fail with a usage error, not an IndexError, when the file argument
        # is missing.
        if not args:
            raise CommandError('You must provide the file to read the '
                               'update counts from.')
        sep = options['separator']
        filename = args[0]
        # First, make sure we don't have any existing counts for the same day,
        # or it would just increment again the same data.
        UpdateCount.objects.filter(date=day).delete()

        # Memoize the UpdateCounts, one per addon.
        update_counts = {}
        # Perf: preload all the addons once and for all.
        # This builds a dict where each key (the addon guid we get from the
        # hive query) has the addon_id as value.
        guids_to_addon = dict(Addon.objects.values_list('guid', 'id'))

        index = -1  # So the final report below works even on an empty file.
        with open(filename) as count_file:
            for index, line in enumerate(count_file):
                if index and (index % 10000) == 0:
                    log.info('Processed %s lines' % index)

                # Drop the trailing newline before splitting the fields.
                splitted = line[:-1].split(sep)

                # The hive query returns NINE columns (count, id, version,
                # status, appID, appVersion, appOS, locale, updateType) —
                # the previous check for 8 made the unpacking below fail on
                # every well-formed row.
                if len(splitted) != 9:
                    log.debug('Badly formatted row: %s' % line)
                    continue

                # Note: app_version (appVersion column) used to shadow the
                # addon's version because the name "version" was unpacked
                # twice.
                (counter, addon_guid, version, status, app_id, app_version,
                 app_os, locale, update_type) = splitted
                try:
                    counter = int(counter)
                except ValueError:  # Badly formatted? Drop.
                    continue

                # The following is magic that I don't understand. I've just
                # been told that this is the way we can make sure a request is
                # valid:
                # > the lower bits for updateType=112 should add to 16, if
                # > not, ignore the request. updateType & 31 == 16 == valid
                # > request.
                # The last column is the updateType this quote is talking
                # about. So: keep rows where updateType & 31 == 16, drop the
                # rest (the previous code dropped exactly the valid ones).
                try:
                    if int(update_type) & 31 != 16:
                        continue
                except ValueError:  # Badly formatted updateType? Drop.
                    continue

                # We may have several statuses in the same field.
                statuses = status.split(',')

                # Does this addon exist?
                if addon_guid in guids_to_addon:
                    addon_id = guids_to_addon[addon_guid]
                else:
                    log.info('Addon with guid: %s not found' % addon_guid)
                    continue

                # Memoize the UpdateCount.
                if addon_guid in update_counts:
                    uc = update_counts[addon_guid]
                else:
                    uc = UpdateCount(date=day, addon_id=addon_id, count=0)
                    update_counts[addon_guid] = uc

                # We can now fill the UpdateCount object.
                uc.count += counter
                uc.versions = update_inc(uc.versions, version, counter)

                # Applications is a dict of dicts, eg:
                # {"{ec8030f7-c20a-464f-9b0e-13a3a9e97384}":  # Firefox.
                #      {"10.0": 2, "21.0": 1, ....},
                #  "some other application guid": ...
                # }
                if uc.applications is None:
                    uc.applications = {}
                app = uc.applications.get(app_id, {})
                # Now overwrite this application's dict with incremented
                # counts for its (application, not addon) versions.
                uc.applications.update(
                    {app_id: update_inc(app, app_version, counter)})

                uc.oses = update_inc(uc.oses, app_os, counter)
                uc.locales = update_inc(uc.locales, locale, counter)

                # We may have received a list of more than one status.
                for single_status in statuses:
                    uc.statuses = update_inc(uc.statuses, single_status,
                                             counter)

        # Create in bulk: this is much faster.
        UpdateCount.objects.bulk_create(update_counts.values(), 100)
        total_time = (datetime.now() - start).total_seconds()
        log.info('Processed a total of %s lines' % (index + 1))
        log.debug('Total processing time: %s seconds' % total_time)
# This helps us increment counters in dicts easily, for StatsDictFields.
def update_inc(initial, key, count):
    """Increment the ``key`` counter in the ``initial`` dict by ``count``.

    If ``initial`` is None or empty, start from a fresh dict. If ``key``
    isn't in ``initial`` yet, add it with a value of ``count``; if it is
    already there, increment its current value by ``count``.

    Returns the (possibly new) dict, so callers can reassign the field.

    """
    initial = initial or {}
    # The previous expression (`count or initial.get(key, 0) + 1`) replaced
    # the accumulated value with `count` instead of adding to it, so repeated
    # calls for the same key never accumulated.
    initial[key] = initial.get(key, 0) + count
    return initial
# TODO: remove when the script is proven to work correctly.
class UpdateCountTmp(SearchMixin, models.Model):
    """Daily per-addon update-check counts, written by
    update_counts_from_file into the temporary `update_counts_tmp` table
    (mirrors UpdateCount / `update_counts`, see migration 770).
    """
    addon = models.ForeignKey('addons.Addon')
    # Total number of update-check requests for this addon on `date`.
    count = models.PositiveIntegerField()
    date = models.DateField()
    # The following fields each store a json-ified dict of key -> counter
    # (built with stats.models.update_inc).
    versions = StatsDictField(db_column='version', null=True)
    statuses = StatsDictField(db_column='status', null=True)
    applications = StatsDictField(db_column='application', null=True)
    oses = StatsDictField(db_column='os', null=True)
    locales = StatsDictField(db_column='locale', null=True)

    class Meta:
        db_table = 'update_counts_tmp'
* * * * * %(z_cron)s fast_current_version @@ -50,6 +51,11 @@ HOME=/tmp # Collect visitor stats from Google Analytics once per day. 50 10 * * * %(z_cron)s update_google_analytics +# Update ADI metrics from HIVE. +# NOTE(review): cron does NOT run command substitution in environment +# assignments, so $YESTERDAY will expand to the literal string +# "$(date --date="yesterday" +"%Y-%m-%d")", and an unescaped "%" is special +# in crontab lines (and to the %(...)s template rendering). Inline an +# escaped date command in each job instead — TODO confirm how this .tpl is +# rendered before deploying. +00 1 * * * %(z_cron)s download_metrics --date $YESTERDAY --with-updates --with-downloads --output adi_data +00 3 * * * %(z_cron)s update_counts_from_file adi_data.updates --date $YESTERDAY +00 4 * * * %(z_cron)s download_counts_from_file adi_data.downloads --date $YESTERDAY + #Once per day after 2100 PST (after metrics is done) 35 5 * * * %(z_cron)s update_addon_download_totals 40 5 * * * %(z_cron)s weekly_downloads