rag-annotation-tool/data_manager.py at main · hltcoe/rag-annotation-tool · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
from typing import Iterable, Set, Tuple, List, Dict, Literal, Mapping, Union
from pathlib import Path

import streamlit as st

import sqlite3
import pandas as pd
import io
import zipfile
import json
import pickle
from copy import deepcopy
from hashlib import md5

from task_resources import TaskConfig

import ir_datasets as irds

# try:
import datasets as hfds
# except ImportError as e:
#     hfds = None

from tqdm import tqdm

class SqliteManager:
    """Small helper around a single SQLite database file.

    With ``persistent_connection=True`` one connection is opened lazily and
    reused. Otherwise every access of :attr:`conn` opens a brand-new
    connection, and :meth:`execute_simple` closes the ones it opened itself.
    """

    def __init__(self, db_path: str, persistent_connection: bool = True):
        """
        Args:
            db_path: Location of the database; stored as ``str(db_path)``.
            persistent_connection: Reuse one connection (True) or open a new
                one on every access of :attr:`conn` (False).
        """
        self.db_path = str(db_path)
        self.persistent_connection = persistent_connection

        # Only populated when persistent connections are enabled.
        self._conn = None

    @property
    def conn(self):
        """Return a connection to ``db_path``.

        In non-persistent mode the caller owns the returned connection and is
        responsible for closing it.
        """
        if self.persistent_connection:
            if self._conn is None:
                self._conn = sqlite3.connect(self.db_path)
            return self._conn
        return sqlite3.connect(self.db_path)

    def table_exists(self, table_name: str):
        """Return True when a table called ``table_name`` exists."""
        # Bind the name as a parameter so quotes in it cannot break the query.
        rows = self.execute_simple(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?;",
            (table_name, )
        )
        return len(rows) > 0

    def execute_simple(self, query: str, args = None, query_only: bool = None, conn: sqlite3.Connection=None):
        """Run one statement and optionally fetch its rows.

        Args:
            query: SQL text; surrounding whitespace is ignored.
            args: Optional parameters bound to the statement.
            query_only: When True, rows are fetched and returned and nothing
                is committed. When None it is inferred: statements starting
                with ``select`` (case-insensitive) are treated as queries.
            conn: Connection to use; defaults to :attr:`conn`. A connection
                passed in by the caller is never closed here.

        Returns:
            ``cursor.fetchall()`` for queries, otherwise None. None is also
            returned when ``sqlite3.OperationalError`` is raised (an error is
            shown through ``st.error`` in that case).
        """
        # We only close connections that we created for this single call.
        owns_conn = conn is None and not self.persistent_connection
        if conn is None:
            conn = self.conn

        # Strip first so that a query with leading whitespace/newlines is
        # still recognised as a SELECT.
        query = query.strip()
        if query_only is None:
            query_only = query.lower().startswith('select')

        try:
            if args:
                cursor = conn.execute(query, args)
            else:
                cursor = conn.execute(query)

            if query_only:
                return cursor.fetchall()

            conn.commit()
        except sqlite3.OperationalError:
            st.error("Database error. Try again later.")
        finally:
            if owns_conn:
                conn.close()


class ActivityLogMananger(SqliteManager):
    """Records the statements issued on behalf of one user in a ``logs`` table."""

    def __init__(self, db_path: str, username: str):
        """Open the log database (non-persistent connection) and make sure ``logs`` exists."""
        super().__init__(db_path, persistent_connection=False)
        self.username = username

        if self.table_exists('logs'):
            return
        self.execute_simple("""create table if not exists logs (username string, query string, args string, ts datetime default current_timestamp);""")

    def log(self, query: str, args = None):
        """Print the statement and store it, with JSON-encoded ``args`` when given."""
        query = query.strip()

        print("[LOG] ", query.replace("\n", '  '), args)

        if args is None:
            self.execute_simple("""insert into logs (username, query) values (?, ?)""", (self.username, query))
            return

        serialized_args = json.dumps(args)
        self.execute_simple("""insert into logs (username, query, args) values (?, ?, ?)""", (self.username, query, serialized_args))


class NuggetSet:
    """Ordered collection of nuggets with an optional group per question.

    ``nugget_list`` holds ``(question, {answer: {doc_id, ...}})`` pairs in
    insertion order. ``group_assignment`` maps a question to its group name;
    questions without an entry belong to the implicit ``"default"`` group.
    Lookups by question strip surrounding whitespace first.
    """

    def __init__(self):
        self.nugget_list: List[Tuple[str, Dict[str, Set[str]]]] = []
        # [ (question, { answer: {doc_id...} ... }), ... ]
        self.group_assignment: Dict[str, str] = {}
        # {question: group}

    def _index_of(self, question: str) -> int:
        """Return the position of an (already stripped) question, or -1."""
        for idx, (q, _) in enumerate(self.nugget_list):
            if q == question:
                return idx
        return -1

    def get(self, question: str, default=None, only_answers: bool=False):
        """Return the answer dict (or only its keys) for ``question``.

        ``default`` is returned when the question is not present.
        """
        question = question.strip()
        for q, a_dict in self.nugget_list:
            if q == question:
                return a_dict if not only_answers else a_dict.keys()

        return default

    @property
    def groups(self):
        """Sorted explicit group names followed by ``"default"`` (listed once)."""
        named_groups = set(self.group_assignment.values()) - {"default"}
        return sorted(named_groups) + ["default"]

    def __len__(self):
        return len(self.nugget_list)

    def __contains__(self, key: str):
        return self.get(key) is not None

    def __getitem__(self, idx: int):
        return self.nugget_list[idx]

    def set_group(self, nq: str, group: str):
        """Assign question ``nq`` to ``group``; ``"default"`` clears the assignment."""
        assert nq in self
        assert group is not None
        if group == "default":
            # The question may already be unassigned; that is not an error.
            self.group_assignment.pop(nq, None)
        else:
            self.group_assignment[nq] = group

    def get_group(self, nq: str):
        """Return the group of ``nq``, or ``"default"`` when it has none."""
        return self.group_assignment.get(nq, "default")

    def rename_group(self, old_name: str, new_name: str):
        """Move every question of ``old_name`` to ``new_name`` (neither may be ``"default"``)."""
        assert old_name in self.groups and old_name != "default" and new_name != "default"
        self.group_assignment = {
            nq: gp if gp != old_name else new_name
            for nq, gp in self.group_assignment.items()
        }

    def iter_grouped_nuggets(self):
        """Yield ``(group, [(idx, question, answer_dict), ...])`` in ``groups`` order.

        Within a group the nuggets keep their ``nugget_list`` order; ``idx``
        is the position in ``nugget_list``.
        """
        group_names = self.groups
        buckets = { g: [] for g in group_names }
        for idx, (nq, a_dict) in enumerate(self.nugget_list):
            buckets[self.group_assignment.get(nq, "default")].append((idx, nq, a_dict))

        for group in group_names:
            yield group, buckets[group]

    def iter_nuggets(self, only_answers: bool=False):
        """Yield ``(idx, question, answers)``; answers is the dict or only its keys."""
        yield from (
            (nidx, question, a_dict.keys() if only_answers else a_dict)
            for nidx, (question, a_dict) in enumerate(self.nugget_list)
        )

    def get_all_questions(self):
        """Return all questions in list order."""
        return [
            q for q, _ in self.nugget_list
        ]

    def rewrite_question(self, old_question: str, new_question: str):
        """Rename a question, merging into ``new_question`` when it already exists.

        Both names are stripped, matching how :meth:`get` and :meth:`add`
        treat questions. Renaming a question to itself is a no-op. The group
        assignment of the old question, if any, moves to the new name.
        """
        assert old_question in self

        old_question = old_question.strip()
        new_question = new_question.strip()
        if old_question == new_question:
            # Without this guard the merge branch would delete the question.
            return

        target_a_dict = self.get(new_question)
        old_idx = self._index_of(old_question)
        if target_a_dict is None:
            # Plain rename, keeping the position in the list.
            self.nugget_list[old_idx] = (new_question, self.nugget_list[old_idx][1])
        else: # need merging
            for old_a, old_doc_set in self.nugget_list[old_idx][1].items():
                if old_a in target_a_dict:
                    target_a_dict[old_a] |= old_doc_set
                else:
                    target_a_dict[old_a] = old_doc_set
            del self.nugget_list[old_idx]

        if old_question in self.group_assignment:
            self.group_assignment[new_question] = self.group_assignment.pop(old_question)

    def remove_question(self, question: str):
        """Delete a question together with its answers and group assignment."""
        assert question in self

        question = question.strip()
        del self.nugget_list[self._index_of(question)]
        # Questions in the default group have no entry to delete.
        self.group_assignment.pop(question, None)

    def add(self, question: str, doc_answer_pairs: Iterable[Tuple[str, str]]):
        """Add ``(doc_id, answer)`` pairs to ``question``, creating it if needed."""
        question = question.strip()
        # doc_answer_pairs = [ (d, a.strip()) for d, a in doc_answer_pairs ]
        answer_new_dict: Dict[str, Set[str]] = {}
        for doc_id, answer in doc_answer_pairs:
            answer_new_dict.setdefault(answer, set()).add(doc_id)

        existing_answer_dict = self.get(question)
        if existing_answer_dict is None:
            self.nugget_list.append(
                (question, answer_new_dict)
            )
            return

        for answer, doc_set in answer_new_dict.items():
            existing_answer_dict.setdefault(answer, set()).update(doc_set)

    def remove(self, question: str, doc_id: str, answers: List[str]):
        """Detach ``doc_id`` from each of ``answers`` under ``question``."""
        assert question in self
        a_dict = self.get(question)
        for answer in answers:
            assert answer in a_dict
            assert doc_id in a_dict[answer]

            a_dict[answer].remove(doc_id)

    def remove_answer(self, question: str, answer: str):
        """Delete one answer (and its doc ids) from ``question``."""
        assert question in self
        a_dict = self.get(question)
        assert answer in a_dict
        del a_dict[answer]

    def rewrite_answer(self, question: str, old_answer: str, new_answer: str):
        """Rename an answer, merging doc ids if ``new_answer`` already exists."""
        if old_answer == new_answer:
            return

        assert question in self
        a_dict = self.get(question)
        assert old_answer in a_dict

        if new_answer not in a_dict:
            a_dict[new_answer] = set()
        a_dict[new_answer] |= a_dict[old_answer]
        del a_dict[old_answer]

    def clone(self):
        """Return a deep copy of this nugget set."""
        new_nugget_set = self.__class__()
        new_nugget_set.nugget_list = deepcopy(self.nugget_list)
        new_nugget_set.group_assignment = deepcopy(self.group_assignment)

        return new_nugget_set

    def __add__(self, obj: 'NuggetSet'):
        """Return a new set holding the union of both operands.

        Neither operand is modified and the result shares no dicts or sets
        with them. On conflicting group assignments ``obj`` wins.
        """
        new_nugget_set = self.clone()

        for q, a_dict in obj.nugget_list:
            target_a_dict = new_nugget_set.get(q)
            if target_a_dict is None:
                # Copy so later edits of the result do not leak into ``obj``.
                new_nugget_set.nugget_list.append((q, deepcopy(a_dict)))
                continue
            for answer, doc_set in a_dict.items():
                target_a_dict.setdefault(answer, set()).update(doc_set)

        new_nugget_set.group_assignment = { **self.group_assignment, **obj.group_assignment }

        return new_nugget_set

    def doc_has_nugget(self, doc_id: str):
        """Return True when ``doc_id`` supports at least one answer."""
        return any(
            doc_id in doc_set
            for _, a_dict in self.nugget_list
            for doc_set in a_dict.values()
        )

    # def _as_nugget_dict(self, only_answers: bool = False):
    #     return {
    #         q: sorted(a_dict.keys()) if only_answers else a_dict
    #         for q, a_dict in self.nugget_list
    #     }

    def as_json(self, indent=None):
        """Serialize to JSON; doc-id sets become lists, default groups are omitted."""
        return json.dumps({
            'nugget_list': [
                (q, { a: list(doc_set) for a, doc_set in a_dict.items() })
                for q, a_dict in self.nugget_list
            ],
            'group_assignment': {
                nq: gp
                for nq, gp in self.group_assignment.items() if gp != "default"
            }
        }, indent=indent)

    def as_dataframe(self):
        """Return a string-typed frame with ``Question`` and ``Answers`` columns.

        Answers of one question are sorted and joined with ``"; "``.
        """
        return pd.DataFrame({
            'Question': [ q for q, _ in self.nugget_list ],
            'Answers': [ "; ".join(sorted(a_dict.keys())) for _, a_dict in self.nugget_list ]
        }).astype(str)

    @classmethod
    def from_list(cls, nugget_list: List[Tuple[str, Dict[str, List[str]]]], group_assignment: Dict[str, str] = None):
        """Build a set from ``(question, {answer: [doc_id, ...]})`` pairs.

        ``group_assignment`` is copied, so the new instance never shares its
        mapping with the caller or with other instances.
        """
        ret = cls()
        ret.nugget_list = [
            (question, { answer: set(doc_ids) for answer, doc_ids in a_dict.items() })
            for question, a_dict in nugget_list
        ]

        # assert len(group_assignment.keys() - set([ q for q, _ in ret.nugget_list ])) == 0
        ret.group_assignment = dict(group_assignment) if group_assignment else {}

        return ret

    @classmethod
    def from_json(cls, json_string: str):
        """Parse the output of :meth:`as_json`.

        Two deprecated layouts are also accepted: an object with a
        ``nugget_dict`` key, and a bare ``{question: {answer: [doc_id]}}``
        object.
        """
        content = json.loads(json_string)
        # if "nugget_dict" not in json_dict:
        #     json_dict = {"nugget_dict": json_dict}

        if "nugget_list" not in content:
            #  deprecated
            if "nugget_dict" not in content:
                content = {'nugget_dict': content}
            content['nugget_list'] = list(content['nugget_dict'].items())
            del content['nugget_dict']

        return cls.from_list(**content)


class NuggetSelection(set):
    """A set of ``(question, answer)`` pairs with JSON and DataFrame views."""

    def __init__(self, selections: Set[Tuple[str, str]] = None):
        # A falsy argument (None, empty container) yields an empty selection.
        super().__init__([] if not selections else selections)

    @classmethod
    def from_json(cls, json_string: str):
        """Rebuild a selection from :meth:`as_json` output; None gives an empty one."""
        if json_string is None:
            return cls()
        pairs = json.loads(json_string)
        return cls(tuple(pair) for pair in pairs)

    def as_json(self):
        """Return the pairs, sorted, as a JSON array."""
        ordered_pairs = sorted(self)
        return json.dumps(ordered_pairs)

    def as_dataframe(self):
        """Return the sorted pairs as a frame with ``Question``/``Answer`` columns."""
        ordered_pairs = sorted(self)
        return pd.DataFrame(ordered_pairs, columns=['Question', 'Answer'])


class NuggetLoader(SqliteManager):
    """Read-only access to nugget sets stored as JSON files or in the ``nuggets`` table."""

    def __init__(
            self,
            username: str,
            db_path: str = None, load_dir: str = None,
            use_json: bool=True,
            combine_nuggets_from_multiple_users: bool=True,
            use_revised_nugget_only: bool=True
        ):
        """
        Args:
            username: User whose nuggets are read when not combining users.
            db_path: SQLite file holding the ``nuggets`` table.
            load_dir: Directory holding ``nuggets_*.json`` files; required
                when ``use_json`` is True.
            use_json: Default source for :meth:`get` (JSON files vs. database).
            combine_nuggets_from_multiple_users: Default for merging the
                nuggets of all users.
            use_revised_nugget_only: Default for reading only the
                ``.revised.json`` file of a topic.
        """
        assert (db_path is not None) or (load_dir is not None)
        if use_json:
            assert load_dir is not None

        super().__init__(db_path, persistent_connection=False)
        # load_dir is optional when only the database is used.
        self.load_dir = Path(load_dir) if load_dir is not None else None

        self.username = username
        self.use_json = use_json
        self.combine_nuggets_from_multiple_users = combine_nuggets_from_multiple_users
        self.use_revised_nugget_only = use_revised_nugget_only

    def iter_nugget_sets_from_json(
            self, topic_id: str,
            use_revised_nugget_only: bool=None, combine_nuggets_from_multiple_users: bool=None
        ):
        """Yield one :class:`NuggetSet` per matching JSON file of ``topic_id``.

        Arguments left as None fall back to the values given to the
        constructor. Files are visited in sorted path order so that merged
        results are reproducible.
        """
        if use_revised_nugget_only is None:
            use_revised_nugget_only = self.use_revised_nugget_only
        if combine_nuggets_from_multiple_users is None:
            combine_nuggets_from_multiple_users = self.combine_nuggets_from_multiple_users

        if use_revised_nugget_only:
            pattern = f"nuggets_{topic_id}.revised.json"
        else:
            # Built outside the f-string: reusing the same quote type inside
            # a replacement field is a syntax error before Python 3.12.
            user_part = "*" if combine_nuggets_from_multiple_users else self.username
            pattern = f"nuggets_{topic_id}_{user_part}.json"

        for fn in sorted(self.load_dir.glob(pattern)):
            yield NuggetSet.from_json(fn.read_text())

    def iter_nuggest_sets_from_db(self, topic_id: str, combine_nuggets_from_multiple_users: bool=None):
        """Yield one :class:`NuggetSet` per ``nuggets`` row of ``topic_id``.

        Rows are restricted to ``self.username`` unless users are combined.
        """
        if combine_nuggets_from_multiple_users is None:
            combine_nuggets_from_multiple_users = self.combine_nuggets_from_multiple_users

        if not combine_nuggets_from_multiple_users:
            records = self.execute_simple(
                """select nugget_json from nuggets where topic_id = ? and username = ?;""",
                (topic_id, self.username)
            )
        else:
            records = self.execute_simple(
                """select nugget_json from nuggets where topic_id = ?;""", (topic_id, )
            )

        # execute_simple returns None after a database error; each row is a
        # 1-tuple whose only element is the JSON text.
        for (nugget_json, ) in records or []:
            yield NuggetSet.from_json(nugget_json)

    def get(self, topic_id: str, source: str=None) -> NuggetSet:
        """Return the merged nugget set of ``topic_id``.

        Args:
            topic_id: Topic to load.
            source: ``'revised'`` forces the revised JSON file, ``'db'``
                forces the database, ``'preload'`` reads
                ``nuggets_<topic_id>.preload.json``; any other value uses the
                constructor defaults.
        """
        use_json = self.use_json
        use_revised_nugget_only = None
        if source == 'revised':
            use_json = True
            use_revised_nugget_only = True
        elif source == 'db':
            use_json = False
        elif source == 'preload':
            return NuggetSet.from_json( (self.load_dir / f"nuggets_{topic_id}.preload.json").read_text() )

        if use_json:
            nugget_sets = self.iter_nugget_sets_from_json(
                topic_id, use_revised_nugget_only=use_revised_nugget_only
            )
        else:
            # The database iterator has no notion of revised nuggets.
            nugget_sets = self.iter_nuggest_sets_from_db(topic_id)

        return sum(nugget_sets, NuggetSet())

    def __getitem__(self, topic_id: str):
        return self.get(topic_id, source=None)

class NuggetSaverManager(SqliteManager):
    """Keeps one editable :class:`NuggetSet` per topic and persists it.

    Nuggets are stored in the ``nuggets`` table (one row per user and topic)
    and mirrored as JSON files in ``output_dir``.
    """

    def __init__(self, db_path: str, output_dir: str, log_manager: ActivityLogMananger, is_admin: bool=False):
        """Load existing nuggets.

        Database rows are loaded first (only this user's rows unless
        ``is_admin``); ``.revised.json`` and then ``.preload.json`` files
        fill in topics that are still missing.
        """
        super().__init__(db_path, persistent_connection=False)
        self.logger = log_manager
        self.username = self.logger.username
        self.output_dir = Path(output_dir)

        self.topic_nuggets: Dict[str, NuggetSet] = {}

        if not self.table_exists('nuggets'):
            self.execute_simple("""
                create table if not exists nuggets (
                    username string, topic_id string,
                    nugget_json string, ts datetime default current_timestamp
                );
            """)

        if not is_admin:
            existing_nugget_records = self.execute_simple("""select topic_id, nugget_json from nuggets where username = ?;""", (self.username, ))
        else:
            existing_nugget_records = self.execute_simple("""select topic_id, nugget_json from nuggets;""")
        # print(existing_nugget_records)
        for topic_id, nugget_json in existing_nugget_records:
            self.topic_nuggets[str(topic_id)] = NuggetSet.from_json(nugget_json)

        for marker in ("revised", "preload"):
            for fn in self.output_dir.glob(f"nuggets_*.{marker}.json"):
                topic_id = self._topic_id_from_filename(fn, marker)
                if topic_id not in self.topic_nuggets:
                    self.topic_nuggets[topic_id] = NuggetSet.from_json(fn.read_text())

    @staticmethod
    def _topic_id_from_filename(fn: Path, marker: str) -> str:
        """Extract ``<topic_id>`` from ``nuggets_<topic_id>.<marker>.json``."""
        # Split only once so topic ids that contain underscores stay intact.
        return fn.stem.replace(f".{marker}", "").split("_", 1)[1]

    def __getitem__(self, topic_id: str):
        """Return the nugget set of ``topic_id``, creating an empty one on first use."""
        if topic_id not in self.topic_nuggets:
            self.topic_nuggets[topic_id] = NuggetSet()

        return self.topic_nuggets[topic_id]

    def __contains__(self, topic_id: str):
        return topic_id in self.topic_nuggets

    def flush(self, topic_id: str):
        """Persist the nugget set of ``topic_id`` to the database and to JSON.

        The row of this user and topic is replaced if it exists (its rowid is
        looked up by the sub-select), otherwise a new row is inserted.
        """
        assert topic_id in self

        # All values are bound as parameters; the first two feed the rowid
        # lookup, the remaining three are the stored columns.
        sql_query = """
            insert or replace into nuggets (rowid, username, topic_id, nugget_json) values (
            (select rowid from nuggets where topic_id = ? and username = ?), ?, ?, json(?));
        """
        sql_args = (topic_id, self.username, self.username, topic_id, self[topic_id].as_json())

        self.logger.log(sql_query, sql_args)
        self.execute_simple(sql_query, sql_args)
        # also save a text version

        with (self.output_dir / f"nuggets_{topic_id}_{self.username}.json").open("w") as fw:
            fw.write(self[topic_id].as_json(indent=4))

    def save_revised_nugget(self, topic_id: str, nugget_to_save: NuggetSet):
        """Write ``nugget_to_save`` to ``nuggets_<topic_id>.revised.json``."""
        with (self.output_dir / f"nuggets_{topic_id}.revised.json").open("w") as fw:
            fw.write(nugget_to_save.as_json(indent=4))

    def to_tsv(self, all_data: bool=False):
        """Return the whole ``nuggets`` table as TSV text, newest rows first.

        ``all_data`` is accepted for interface parity and is not used.
        """
        conn = self.conn
        try:
            table = pd.read_sql_query("select * from nuggets", conn)
        finally:
            # In non-persistent mode ``self.conn`` handed us a fresh connection.
            if not self.persistent_connection:
                conn.close()

        return table.astype(str).sort_values('ts', ascending=False).to_csv(index=False, sep="\t")


def _flatten_dict(obj: Mapping[str, Mapping]):
    for key, val in obj.items():
        if isinstance(val, list):
            # val = { i: v for i, v in enumerate(val) }
            val = { v: "" for v in val }

        if isinstance(val, dict):
            yield from ( ((key, *cum_key), v) for cum_key, v in _flatten_dict(val) )
        else:
            yield (key, ), val

def _multi_level_dict_to_series(obj: Mapping[str, Mapping], names: List[str] = None):
    """Flatten a nested mapping into a Series keyed by its key paths.

    Args:
        obj: Nested mapping; flattened with ``_flatten_dict``.
        names: Names for the index levels. When omitted the index is left
            unnamed.
    """
    series = pd.Series(dict(_flatten_dict(obj)))
    if names is None:
        return series
    return series.rename_axis(names)


class AnnotationManager(SqliteManager):
    """Tracks annotations for a fixed set of items, backed by one SQLite table.

    ``content_df`` has one row per item (indexed by ``level_names``), a
    ``content`` column and one column per annotation slot. Every change made
    through :meth:`annotate` is appended to the table as a new row; on start
    the most recent value per item and slot of this user is loaded back.
    """

    def __init__(
            self, db_path: str,
            output_dir: str, log_manager: ActivityLogMananger,
            table_name: str,
            content_obj: Dict,
            level_names: Tuple[str],
            slot_names: Union[Tuple[str], str]
        ):
        """
        Args:
            db_path: SQLite file holding ``table_name``.
            output_dir: Stored as ``self.output_dir``.
            log_manager: Logger; also provides the username.
            table_name: Table the annotations are appended to.
            content_obj: Nested mapping of the items to annotate; its nesting
                levels are named by ``level_names``.
            level_names: Names of the index levels / key columns.
            slot_names: One slot name or a tuple of slot names.
        """
        super().__init__(db_path, persistent_connection=False)
        self.logger = log_manager
        self.username = log_manager.username
        self.output_dir = Path(output_dir)
        self.table_name = table_name

        slot_names = (slot_names, ) if isinstance(slot_names, str) else slot_names

        content_df = _multi_level_dict_to_series(content_obj, level_names)
        self.content_df = content_df.rename('content').to_frame().assign(**{
            name: [None]*content_df.shape[0] for name in slot_names
        }).sort_index()

        if not self.table_exists(self.table_name):
            col_string = ", ".join(
                f"{col} string" for col in self.content_df.index.names
            )
            self.execute_simple(f"""
                create table if not exists {table_name} (
                    username string, {col_string},
                    slot_name string, annotation string,
                    ts datetime default current_timestamp
                );
            """)

        # Latest annotation per (item, slot) of this user, one column per slot.
        record = self._read_sql(
            f"select * from {self.table_name} where username = ?", params=(self.username, )
        ).astype(str).sort_values('ts', ascending=False)\
        .groupby(self.content_df.index.names + ['slot_name']).first()\
        ['annotation'].unstack('slot_name')

        for slot in self.slot_names:
            if slot in record.columns:
                # Ignore stored annotations for items that are no longer part
                # of the content; indexing with unknown labels would raise.
                known = record.index.intersection(self.content_df.index)
                if len(known) > 0:
                    self.content_df.loc[known, slot] = record.loc[known, slot]

    def _read_sql(self, query: str, params=None) -> pd.DataFrame:
        """Run ``query`` through pandas and close a non-persistent connection afterwards."""
        conn = self.conn
        try:
            return pd.read_sql_query(query, conn, params=params)
        finally:
            if not self.persistent_connection:
                conn.close()

    @property
    def slot_names(self):
        """Annotation column names (every column except ``content``)."""
        return [ s for s in self.content_df.columns if s != 'content' ]

    @property
    def level_names(self):
        """Names of the index levels of ``content_df``."""
        return self.content_df.index.names

    def __contains__(self, keys):
        return keys in self.content_df.index

    def __getitem__(self, keys):
        """Look up ``keys`` in ``content_df``.

        Returns an empty iterator for unknown keys, a row iterator when the
        selection is a frame, and otherwise a dict of the single row in which
        the ``nugget`` value is decoded into a :class:`NuggetSelection`.
        """
        if keys not in self:
            return iter([])

        sel: Union[pd.DataFrame, pd.Series] = self.content_df.loc[keys]
        if isinstance(sel, pd.DataFrame):
            return sel.iterrows()
        return {
            key: NuggetSelection.from_json(val) if key == 'nugget' else val
            for key, val in sel.to_dict().items()
        }

    def _slot_values(self) -> pd.DataFrame:
        """Return a copy of the slot columns with empty nugget selections as missing."""
        d = self.content_df.drop('content', axis=1)
        if 'nugget' in self.slot_names:
            # An empty selection ("[]") counts as not annotated.
            d['nugget'] = d.nugget.replace({"[]": None})
        return d

    def is_all_done(self, *keys):
        """Return True when no slot under ``keys`` is missing (or keys are unknown)."""
        if keys not in self:
            return True

        d = self._slot_values()

        return not d.loc[keys].isna().any().any().item()

    def count_done(self, *keys, level=None):
        """Count finished work under ``keys``.

        Without ``level`` the number of filled slot cells is returned. With
        ``level`` the number of groups (by that index level) without any
        missing slot is returned.
        """
        if keys not in self:
            return 0

        # Select after dropping 'content' so a fully specified key (which
        # selects a Series) is handled as well.
        d = self._slot_values().loc[keys]

        if level is None:
            return (~d.isna()).sum().sum().item()

        return d.groupby(level).apply(lambda x: ~x.isna().any().any()).sum().item()

    def count_job(self, *keys, level=None):
        """Count the work under ``keys``.

        Without ``level`` this is the number of slot cells; with ``level`` it
        is the number of distinct values of that index level. ``content_df``
        is not modified.
        """
        if keys not in self:
            return 0

        sel = self.content_df.loc[keys]

        if level is None:
            if isinstance(sel, pd.Series):
                # Single row: 'content' is an entry of the Series index.
                return sel.drop('content').size
            return sel.drop('content', axis=1).size

        return sel.index.get_level_values(level).unique().size

    def annotate(self, key: List[str], slot: str, annotation):
        """Store ``annotation`` for ``slot`` of item ``key`` and append it to the table.

        A :class:`NuggetSelection` is stored as its JSON text. Nothing is
        written when the stored value already equals ``annotation``.
        """
        assert slot in self.slot_names

        if isinstance(annotation, NuggetSelection):
            annotation = annotation.as_json()

        # prevent update if the value is the same
        if key in self and self.content_df.loc[pd.MultiIndex.from_tuples([key]), slot].iloc[0] == annotation:
            # print(f"-- same value for {key}, skip update")
            return

        self.content_df.loc[pd.MultiIndex.from_tuples([key]), slot] = annotation

        # save to db
        sql_query = f"""
            insert into {self.table_name} ({', '.join(self.level_names)}, slot_name, annotation, username) values
            ({', '.join(['?']*len(self.level_names))}, ?, ?, ?);
        """
        sql_args = (*key, slot, annotation, self.username)

        self.logger.log(sql_query, sql_args)
        self.execute_simple(sql_query, sql_args)

    def to_tsv(self, all_data: bool=False):
        """Export as TSV text.

        By default the in-memory ``content_df`` is exported; with
        ``all_data=True`` every row of the table is exported, newest first.
        """
        if not all_data:
            return self.content_df.to_csv(sep="\t")

        return self._read_sql(
            f"select * from {self.table_name};"
        ).astype(str).sort_values('ts', ascending=False).to_csv(index=False, sep="\t")


def session_set_default(session_key, default=None):
    """Return ``st.session_state[session_key]``, initialising it if absent.

    A callable ``default`` is called to produce the initial value; any other
    ``default`` is stored as is.
    """
    state = st.session_state
    if session_key in state:
        return state[session_key]

    state[session_key] = default() if callable(default) else default
    return state[session_key]

def _hash_hfds(ds: hfds.arrow_dataset.Dataset):
    """Return the MD5 hex digest of the dataset's sorted, concatenated cache file names."""
    filenames = sorted(entry['filename'] for entry in ds.cache_files)
    joined = "".join(filenames)
    return md5(joined.encode()).hexdigest()

def _get_hfds_id_mapping(ds: hfds.arrow_dataset.Dataset) -> Dict[str, int]:
    """Return a ``{doc id: row index}`` mapping for ``ds``, cached on disk.

    The mapping is pickled next to the dataset's first cache file, under a
    name derived from ``_hash_hfds(ds)``, and reused on later calls.
    """
    cache_dir = Path(ds.cache_files[0]['filename']).parent
    cache_fn = cache_dir / f"{_hash_hfds(ds)}.doc_id_mapping.pkl"

    if not cache_fn.exists():
        print(f"creating cache at {cache_fn}")
        mapping = {}
        for row_idx, doc in enumerate(tqdm(ds)):
            mapping[doc['id']] = row_idx
        with cache_fn.open('wb') as fw:
            pickle.dump(mapping, fw)

        return mapping

    # NOTE(review): pickle.load executes whatever the file contains; this is
    # only safe while the cache directory is writable by trusted users.
    with cache_fn.open('rb') as f:
        return pickle.load(f)

@st.cache_resource
def _get_hfds_ds(ds_id, revision=None, split=None):
    """Load a dataset through ``hfds.load_dataset`` and pair it with its doc-id mapping."""
    dataset = hfds.load_dataset(ds_id, revision=revision, split=split)
    id_mapping = _get_hfds_id_mapping(dataset)
    return dataset, id_mapping


@st.cache_data(ttl=600)
def get_doc_content(service: str, collection_id: str, doc_id: str):
    """Fetch the title and text of one document.

    Args:
        service: ``'ir_datasets'`` or ``'hf_datasets'``.
        collection_id: For ``ir_datasets`` the dataset id. For
            ``hf_datasets`` one or more ``<dataset>[#<revision>]:<split>``
            specs joined by ``+``; they are searched in order.
        doc_id: Identifier of the document.

    Returns:
        A dict with ``title`` and ``text``. When the service is unknown or
        the document is not found, a placeholder text naming the request is
        returned instead.

    Raises:
        ValueError: If a ``hf_datasets`` spec does not contain exactly one
            ``:``.
    """
    if service == 'ir_datasets':
        doc = irds.load(collection_id).docs.lookup(doc_id)
        return {
            'title': doc.title if hasattr(doc, 'title') else "",
            'text': doc.default_text()
        }

    elif service == 'hf_datasets' and hfds is not None:
        # <user>/<project>#<branch>:<subset>+...
        for spec in collection_id.split('+'):
            ds_id, subset = spec.split(":")
            # The "#<branch>" part is optional; without it no revision is pinned.
            ds_id, _, revision = ds_id.partition('#')
            ds, mapping = _get_hfds_ds(ds_id, revision=revision or None, split=subset)
            idx = mapping.get(doc_id, None)
            if idx is not None:
                # Fetch the row once instead of once per field access.
                row = ds[idx]
                return {
                    'title': row['title'] if 'title' in row else "",
                    'text': row['text']
                }

    return {'title': "", "text": f"Suppose to be {service} {collection_id} // {doc_id}"}


def get_manager(task_config: TaskConfig, username: str, manager_name: str, is_admin=False) -> AnnotationManager:
    """Return the session-cached manager called ``manager_name`` for this task.

    Managers are stored in ``st.session_state`` under
    ``"<task name>/<manager name>"`` and built on first use. A logger is
    always ensured first. For a name without a known factory the existing
    session entry is returned as is.
    """
    output_dir = Path(task_config.output_dir)

    logger = session_set_default(f'{task_config.name}/logger', lambda : ActivityLogMananger(output_dir / "log.db", username))

    def _annotation_factory(table_name, content_attr, slot_names, level_names):
        # The content attribute is read only when the manager is really built.
        return lambda : AnnotationManager(
            output_dir / "annotation.db", # could be different
            output_dir, logger,
            table_name=table_name,
            content_obj=getattr(task_config, content_attr),
            slot_names=slot_names,
            level_names=level_names
        )

    factories = {
        "nugget_manager": lambda : NuggetSaverManager(
            output_dir / "annotation.db", output_dir, logger, is_admin=is_admin
        ),
        "relevance_assessment_manager": _annotation_factory(
            "doc_binary_rel", "pooled_docs",
            'no_nugget_found', ['topic_id', 'doc_id']
        ),
        "citation_assessment_manager": _annotation_factory(
            "sent2doc", "cited_sentences",
            'annot', ['topic_id', 'doc_id', 'run_id', 'sent_id']
        ),
        "nugget_alignment_manager": _annotation_factory(
            "sent2nugget", "report_runs",
            ('nugget', ), ['topic_id', 'run_id', 'sent_id']
        ),
    }

    session_key = f"{task_config.name}/{manager_name}"
    factory = factories.get(manager_name)
    if factory is None:
        return st.session_state[session_key]

    return session_set_default(session_key, factory)

def export_data(
        task_config: TaskConfig, username: str, manager_names: List[str],
        with_revised_nuggets: bool=True, with_annotator_nuggets: bool=False
    ):
    """Bundle annotation tables and nugget files into an in-memory zip archive.

    Args:
        task_config: Task whose ``output_dir`` holds the nugget files.
        username: Passed to ``get_manager``.
        manager_names: Managers to export; each becomes ``<name>.tsv`` with
            the ``_manager`` suffix removed from the name.
        with_revised_nuggets: Include ``nuggets_*.revised.json`` files.
        with_annotator_nuggets: Include files matching ``nuggets_*_*.json``.

    Returns:
        The ``io.BytesIO`` buffer holding the archive.
    """
    managers = {
        name.replace("_manager", ""): get_manager(task_config, username, name)
        for name in manager_names
    }
    output_dir = Path(task_config.output_dir)

    patterns = []
    if with_revised_nuggets:
        patterns.append("nuggets_*.revised.json")
    if with_annotator_nuggets:
        patterns.append("nuggets_*_*.json")

    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w") as fw:
        for name, manager in managers.items():
            fw.writestr(f"{name}.tsv", manager.to_tsv(all_data=True))

        # A revised file whose topic id contains "_" matches both patterns;
        # remember what was written so no archive member is duplicated.
        written = set()
        for pattern in patterns:
            for fn in sorted(output_dir.glob(pattern)):
                if fn.name in written:
                    continue
                fw.writestr(fn.name, fn.read_text())
                written.add(fn.name)

    return zip_buffer


def get_nugget_loader(
        task_config: TaskConfig, username: str=None,
        from_all_users: bool=None, use_revised_nugget: bool=None
    ):
    """Build a :class:`NuggetLoader` for the task's output directory.

    ``from_all_users`` and ``use_revised_nugget`` fall back to the matching
    ``task_config`` settings when left as None. JSON files are used as the
    source when ``task_config.load_nugget_from`` equals ``'json'``.
    """
    revised_only = (
        task_config.use_revised_nugget_only
        if use_revised_nugget is None else use_revised_nugget
    )
    combine_users = (
        task_config.combine_nuggets_from_multiple_users
        if from_all_users is None else from_all_users
    )
    read_from_json = task_config.load_nugget_from == 'json'

    output_dir = Path(task_config.output_dir)
    return NuggetLoader(
        username=username,
        db_path=output_dir / "annotation.db",
        load_dir=output_dir,
        use_json=read_from_json,
        combine_nuggets_from_multiple_users=combine_users,
        use_revised_nugget_only=revised_only
    )