From c70c507e6ff58eea2fcab5f5b3fb7ad04463ac63 Mon Sep 17 00:00:00 2001
From: Rob Savoye
Date: Sun, 25 Feb 2024 11:09:06 -0700
Subject: [PATCH] fix: Refactor to create a list of SQL queries, instead of
 the data

---
 tm_admin/tasks/tasks.py | 201 ++++++++++++++--------------------------
 1 file changed, 70 insertions(+), 131 deletions(-)

diff --git a/tm_admin/tasks/tasks.py b/tm_admin/tasks/tasks.py
index afad99d9..504cbe56 100755
--- a/tm_admin/tasks/tasks.py
+++ b/tm_admin/tasks/tasks.py
@@ -62,100 +62,9 @@ async def updateThread(
         queries (list): The list of SQL queries to execute
         db (PostgresClient): A database connection
     """
-    # pbar = tqdm.tqdm(queries)
+    pbar = tqdm.tqdm(queries)
+    for sql in pbar:
     # for sql in queries:
-    # for sql in pbar:
-        print(sql)
-        result = await db.execute(sql)
-
-    return True
-
-async def historyThread(
-    data: list,
-    db: PostgresClient,
-    table: str = "tasks",
-):
-    """Thread to handle importing
-
-    Args:
-        data (list): The list of records to import
-        db (PostgresClient): A database connection
-        table (str): The table to update
-    """
-    # pbar = tqdm(data)
-    for entry in data:
-        # there is only one entry if using row_to_json()
-        id = entry['id']
-        uid = entry['user_id']
-        pid = entry['project_id']
-        tid = entry['task_id']
-        action = entry['action']
-        date = entry['action_date']
-        # Remove embedded single quotes
-        text = str()
-        if entry['action_text'] is None:
-            text = "NULL"
-        else:
-            text = entry['action_text'].replace("'", "")
-        timestamp = str(entry['action_date'])
-        # timestamp = "{%Y-%m-%dT%H:%M:%S}".format(date)
-        # timestamp = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
-        # entry['action_date'] = timestamp
-        # FIXME: currently the schema has this as an int, it's actully an enum
-        func = eval(f"Taskaction.{action}")
-        # columns = f"id, project_id, history.action, history.action_text, history.action_date, history.user_id"
-        # nested = f"{record['id']}, {record['project_id']}, {func.value}, '{text}', '{timestamp}', {record['user_id']}"
-        sql = f"UPDATE tasks "
-        sql += f" SET history=history||({pid}, {tid}, {func.value}, '{text}', '{timestamp}', {uid})::task_history"
-        # sql += f" SET history = (SELECT ARRAY_APPEND(history,({func.value}, '{text}', '{timestamp}', {entry['user_id']})::task_history)) "
-        sql += f" WHERE id={entry['task_id']} AND project_id={entry['project_id']}"
-        print(f"{sql};")
-        #try:
-        result = db.execute(sql)
-        #except:
-        #    log.error(f"Couldn't execute query! '{sql}'")
-
-    return True
-
-async def invalidationThread(
-    data: list,
-    db: PostgresClient,
-):
-    """Thread to handle importing
-
-    Args:
-        data (list): The list of records to import
-        db (PostgresClient): A database connection
-    """
-    pbar = tqdm.tqdm(data)
-    for record in pbar:
-        map_timestamp = "NULL"
-        inval_timestamp = "NULL"
-        up_timestamp = "NULL"
-        val_timestamp = "NULL"
-        date = record['mapped_date']
-        if date is not None:
-            map_timestamp = "'{:%Y-%m-%dT%H:%M:%S}'".format(date)
-        date = record['invalidated_date']
-        if date is not None:
-            inval_timestamp = "'{:%Y-%m-%dT%H:%M:%S}'".format(date)
-        date = record['updated_date']
-        if date is not None:
-            up_timestamp = "'{:%Y-%m-%dT%H:%M:%S}'".format(date)
-        date = record['validated_date']
-        if date is not None:
-            val_timestamp = "'{:%Y-%m-%dT%H:%M:%S}'".format(date)
-
-        vid = "NULL"
-        if record['validator_id'] is not None:
-            vid = record['validator_id']
-
-        # columns = f"is_closed, mapper_id, mapped_date, invalidator_id, invalidated_date, invalidation_history_id, validator_id, validated_date, updated_date"
-
-        sql = f"UPDATE tasks"
-        # sql += f" SET invalidation_history = (SELECT ARRAY_APPEND(invalidation_history,({record['is_closed']}, {record['mapper_id']}, {map_timestamp}, {record['invalidator_id']}, {inval_timestamp}, {record['invalidation_history_id']}, {vid}, {val_timestamp}, {up_timestamp})::task_invalidation_history)) "
-        sql += f" SET invalidation_history = invalidation_history||({record['is_closed']}, {record['mapper_id']}, '{record['mapped_date']}', {record['invalidator_id']}, '{record['invalidated_date']}', {record['invalidation_history_id']}, {record['validator_id']}, '{record['validated_date']}', '{record['updated_date']}')::task_invalidation_history"
-        sql += f"WHERE id={record['task_id']} AND project_id={record['project_id']}"
         # print(sql)
         result = await db.execute(sql)
@@ -177,22 +86,16 @@ def __init__(self,
         self.profile = TasksTable()
         super().__init__('tasks')
 
-    async def mergeIssues(self,
-                          inpg: PostgresClient,
-                          ):
-        table = "task_mapping_issues"
-        log.error(f"mergeIssues() Unimplemented!")
-        timer = Timer(initial_text=f"Merging {table} table...",
-                      text="merging table took {seconds:.0f}s",
-                      logger=log.debug,
-                      )
-        log.info(f"Merging {table} table...")
-
     async def mergeAnnotations(self,
                                inpg: PostgresClient,
                                ):
-        table = "task_annotationstask_annotations"
-        log.error(f"mergeAnnotations() nimplemented!")
+        """
+        Merge the task_annotation table from Tasking Manager into
+        TM Admin. This table doesn't actually appear to be currently
+        used by TM at all.
+        """
+        table = "task_annotations"
+        log.error(f"mergeAnnotations() Unimplemented as the source is empty!")
         timer = Timer(initial_text="Merging {table} table...",
                       text="merging {table} table took {seconds:.0f}s",
                       logger=log.debug,
                       )
@@ -218,10 +121,11 @@ async def mergeAuxTables(self,
         # FIXME: in TM, this table is empty
         # await self.mergeAnnotations(inpg)
 
-        # await self.mergeHistory(inpg)
+        await self.mergeHistory(inpg)
 
         await self.mergeInvalidations(inpg)
 
+        # This is now handled by mergeHistory
         # await self.mergeIssues(inpg)
 
@@ -233,43 +137,79 @@ async def mergeHistory(self,
         """
         table = 'task_history'
         timer = Timer(initial_text=f"Merging {table} table...",
-                      # text=f"merging {table} table took {seconds:.0f}s",
-                      logger=log.debug,
+                      text="merging table took {seconds:.0f}s",
+                      logger=log.debug,
                       )
         log.info(f"Merging {table} table...")
-        # pg = PostgresClient()
-        # await pg.connect('localhost/tm4')
-        # sql = f"SELECT MIN(project_id),MAX(project_id) FROM task_history"
-        # Get the number of records
-        # sql = f"SELECT reltuples::bigint AS estimate FROM pg_class WHERE oid = 'public.task_history'::regclass;"
-        # entries = await pg.getRecordCount(table)
+        timer.start()
+        # There is a small amount of data in this table, and we need to
+        # correlate it to the task history when merging, so read in
+        # the entire dataset.
+        sql = f"SELECT * FROM task_mapping_issues ORDER BY id;"
+        # print(sql)
+        data = await inpg.execute(sql)
+        entries = len(data)
+        log.debug(f"There are {len(data)} records in task_mapping_issues")
+        issues = dict()
+        # pbar = tqdm.tqdm(data)
+        # for record in pbar:
+        for record in data:
+            hid = record['task_history_id']
+            issues[hid] = {'issue': record['issue'],
+                           'category': record['mapping_issue_category_id'],
+                           'count': record['count'],
+                           }
+
+        # Now get the data from the history table
         sql = f"SELECT * FROM {table}"
         # print(sql)
-        timer.start()
         data = await inpg.execute(sql)
         entries = len(data)
         log.debug(f"There are {len(data)} records in {table}")
-        timer.stop()
         chunk = round(entries/cores)
-        blocks = list()
-        # Some tables in the input database are huge, and can either core
-        # dump python, or have performance issues. Past a certain threshold
-        # the data needs to be queried in pages instead of the entire table
-        # This is a huge table, we can't read in the entire thing
+        # FIXME: create an array of SQL queries, so later we can use
+        # prepared_queries in asyncpg for better performance. We also don't
+        # need all of the columns from the TM table, since task ID and
+        # project ID are already part of the table schema.
+        queries = list()
+        # pbar = tqdm.tqdm(data)
+        # for record in pbar:
+        for record in data:
+            entry = {"user_id": record['user_id']}
+            # entry['action'] = Taskaction(record['action']).name
+            entry['action'] = record['action']
+            entry['action_text'] = record['action_text']
+            if record['action_date']:
+                entry['action_date'] = '{:%Y-%m-%dT%H:%M:%S}'.format(record['action_date'])
+            # If there is an issue, add it to the record in the jsonb column
+            if record['id'] in issues:
+                entry.update(issues[record['id']])
+                # entry['issue'] = issues['issue']
+                # entry['category'] = issues['category']
+                # entry['count'] = issues['count']
+            asc = str(entry).replace("'", '"').replace("\\'", "'")
+            sql = "UPDATE tasks SET history = '{\"history\": [%s]}' WHERE id=%d AND project_id=%d" % (asc, record['task_id'], record['project_id'])
+            # print(sql)
+            queries.append(sql)
+
+        entries = len(queries)
+        chunk = round(entries/cores)
+        import copy
         async with asyncio.TaskGroup() as tg:
             for block in range(0, entries, chunk):
             # for index in range(0, cores):
                 outpg = PostgresClient()
+                # FIXME: this should not be hard coded
                 await outpg.connect('localhost/tm_admin')
-                log.debug(f"Dispatching thread {block}:{block + chunk}")
-                # await licensesThread(data, outpg)
-                await historyThread(data[block:block + chunk], outpg)
-                # task = tg.create_task(historyThread(data[block:block + chunk], outpg))
-
-        # result = historyThread(data, adminpg[index], f"{table}{index}_view")
+                foo = copy.copy(queries[block:block + chunk - 1])
+                log.debug(f"Dispatching thread {block}:{block + chunk - 1}")
+#                await updateThread(foo, outpg)
+                # await updateThread(queries[block:block + chunk], outpg)
+                task = tg.create_task(updateThread(foo, outpg))
+        timer.stop()
 
     async def mergeInvalidations(self,
                                  inpg: PostgresClient,
                                  ):
@@ -320,9 +260,8 @@ async def mergeInvalidations(self,
                 entry["is_closed_id"] = "false"
             # entries[record['task_id']].append(entry)
             asc = str(entry).replace("'", '"').replace("\\'", "'")
-            # UPDATE tasks SET invalidation_history = '{"history": [{"user_id": 35, "mapper_id": 11593853, "invalidator_id": 11596055}]}' WHERE id=35 AND project_id=105;
             sql = "UPDATE tasks SET invalidation_history = '{\"history\": [%s]}' WHERE id=%d AND project_id=%d" % (asc, record['task_id'], record['project_id'])
-            print(sql)
+            # print(sql)
             queries.append(sql)
 
         entries = len(queries)
@@ -335,7 +274,7 @@
                 await outpg.connect('localhost/tm_admin')
                 log.debug(f"Dispatching thread {block}:{block + chunk}")
                 #await updateThread(queries[block:block + chunk], outpg)
-                task = tg.create_task(updateThread(data[block:block + chunk], outpg))
+                task = tg.create_task(updateThread(queries[block:block + chunk], outpg))
 
 async def main():
     """This main function lets this class be run standalone by a bash script."""
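
The patch drops mergeIssues() because mergeHistory() now folds the small
task_mapping_issues table into the history jsonb itself. The correlation step
reduces to keying the issue rows by task_history_id and merging any match into
the entry before it is serialized. A minimal sketch of just that step; the rows
here are made up for illustration:

    # Rows as they would come back from the two SELECTs in mergeHistory();
    # the values are invented for this example.
    issue_rows = [
        {"task_history_id": 7, "issue": "bad imagery",
         "mapping_issue_category_id": 2, "count": 1},
    ]
    history_row = {"id": 7, "user_id": 42, "action": "LOCKED_FOR_MAPPING",
                   "action_text": None, "task_id": 1, "project_id": 105}

    # Key the small table by task_history_id for constant-time lookups...
    issues = {
        row["task_history_id"]: {
            "issue": row["issue"],
            "category": row["mapping_issue_category_id"],
            "count": row["count"],
        }
        for row in issue_rows
    }

    # ...then fold a matching issue into the history entry before it is
    # serialized into the jsonb column.
    entry = {"user_id": history_row["user_id"], "action": history_row["action"]}
    if history_row["id"] in issues:
        entry.update(issues[history_row["id"]])
    assert entry["issue"] == "bad imagery"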
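
The FIXME in mergeHistory() explains why the refactor builds a query list at
all: once the work is expressed as queries, a later pass can switch to
parameterized statements that asyncpg prepares once, instead of %-formatted SQL
strings. A rough sketch of what that could look like, assuming plain asyncpg;
the update_history() helper, the DSN, and the record layout are inventions for
illustration, not part of the patch:

    import asyncio
    import json

    import asyncpg

    async def update_history(records: list):
        """Apply one history entry per task row with a single prepared statement."""
        # Hypothetical DSN; the patch hard codes 'localhost/tm_admin' too.
        conn = await asyncpg.connect("postgresql://localhost/tm_admin")
        # $n placeholders let asyncpg prepare the statement once and reuse
        # it for every argument tuple, and they remove the quote-stripping
        # workarounds the deleted historyThread() needed.
        sql = "UPDATE tasks SET history = $1 WHERE id = $2 AND project_id = $3"
        args = [
            (json.dumps({"history": [rec["entry"]]}),
             rec["task_id"],
             rec["project_id"])
            for rec in records
        ]
        # executemany() sends every argument tuple through the one
        # prepared statement.
        await conn.executemany(sql, args)
        await conn.close()

    asyncio.run(update_history([
        {"task_id": 1, "project_id": 105,
         "entry": {"user_id": 42, "action": "LOCKED_FOR_MAPPING"}},
    ]))

json.dumps() would also replace the str(entry).replace("'", '"') quoting trick,
which breaks on values that themselves contain quotes.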
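
After this change, mergeHistory() and mergeInvalidations() share one fan-out
shape: build the full query list, split it into roughly one chunk per core, and
give each chunk its own connection and TaskGroup task. A self-contained sketch
of that shape using only the standard library; worker() stands in for
updateThread() and its PostgresClient connection:

    import asyncio

    async def worker(chunk: list):
        # Stand-in for updateThread(): one DB connection would live here.
        for sql in chunk:
            await asyncio.sleep(0)  # stand-in for await db.execute(sql)

    async def dispatch(queries: list, cores: int = 4):
        # max(1, ...) avoids a zero step when there are fewer queries
        # than cores.
        size = max(1, round(len(queries) / cores))
        # TaskGroup (Python 3.11+) waits for every task and surfaces errors.
        async with asyncio.TaskGroup() as tg:
            for block in range(0, len(queries), size):
                tg.create_task(worker(queries[block:block + size]))

    asyncio.run(dispatch([f"UPDATE tasks SET ... -- {i}" for i in range(10)]))

Python slices exclude their end index, so queries[block:block + size] already
yields non-overlapping chunks that cover every element; a slice ending at
block + chunk - 1, as in the mergeHistory() hunk above, leaves the last query
of each chunk behind.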