<!doctype html>
<html lang="en">
<head>
<!-- Required meta tags -->
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<!-- Bootstrap CSS -->
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.0.0/dist/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
<title>Metadata for image, video and audio captioning training datasets</title>
</head>
<body style="background-color: #f5f5dc">
<div class = "body_title">
<nav class="navbar navbar-dark bg-dark">
<a class="navbar-brand" href="#" style="white-space: normal" >Metadata for image, video and audio captioning training datasets</a>
</nav>
</div>
<div class="body_content" style="margin:20px; text-align: justify">
<p>In the LINDA project (Using Language to Interpret Nonstructured Data), we conducted a metadata analysis of the training datasets used in automatic captioning. We provide metadata for 66 image, video and audio captioning training datasets, which can be used for finding suitable datasets and for other research purposes.</p>
<p><a href="Captioning%20datasets%20metadata.xlsx" target="_blank">This file</a> includes a list of captioning training datasets alongside information about their compilation principles (e.g., source data, number of captions). The file includes a codebook with information about the variables. For details regarding selection criteria for the initial list, please see Hekanaho, Hirvonen & Virtanen (forthcoming). The metadata file is available for research purposes and can be amended (updated versions to follow). Please cite the following article if the metadata is used for research.</p>
<p>Citation: Hekanaho, L., Hirvonen, M. & Virtanen, T. (forthcoming). Language-based machine perception: Linguistic perspectives on the compilation of captioning datasets. <i>Digital Scholarship in the Humanities</i>.</p>
<p> <b>Contact: </b> <br>
<a href="https://researchportal.helsinki.fi/en/persons/laura-hekanaho" target="_blank">Laura Hekanaho </a> laura.hekanaho@helsinki.fi <br>
<a href="https://www.tuni.fi/fi/maija-hirvonen" target="_blank">Maija Hirvonen </a> maija.hirvonen@tuni.fi <br>
<a href="https://homepages.tuni.fi/tuomas.virtanen/" target="_blank">Tuomas Virtanen</a> tuomas.virtanen@tuni.fi </p>
</div>
<div class = "body_table" id="dataset-table" style="margin:20px; text-align: justify; display: inline-block;">
<h4> Overview of datasets </h4>
<table class="table" style="display: inline-table;">
<thead class="thead-dark">
<tr>
<th>#</th> <th>Dataset name</th> <th>Modality</th> <th>Language</th>
</tr>
</thead>
<tbody>
<!-- Rows are added dynamically by the script below, since there are too many to add manually. -->
</tbody>
</table>
</div>
<!-- Optional JavaScript -->
<!-- jQuery first, then Popper.js, then Bootstrap JS -->
<script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha384-KJ3o2DKtIkvYIK3UENzmM7KCkRr/rE9/Qpg6aAZGJwFDMVNA/GpGFF93hXpG5KkN" crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/npm/popper.js@1.12.9/dist/umd/popper.min.js" integrity="sha384-ApNbgh9B+Y1QKtv3Rn7W3mgPxhU9K/ScQsAP7hUibX39j7fakFPskvXusvfa0b4Q" crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@4.0.0/dist/js/bootstrap.min.js" integrity="sha384-JZR6Spejh4U02d8jOt6vLEHfe/JQGiRRSQQxSfFWpi1MquVdAyjUar5+76PVCmYl" crossorigin="anonymous"></script>
<script>
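// Dataset metadata: each entry is [dataset name, modality, language(s)].
// The numbered first column of the table is generated automatically below.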
const datasetData = [
["ActivityNet Captions", "video", "EN"],
["ActivityNet Entities", "video", "EN"],
["AudioCaps", "audio", "EN"],
["BreakingNews", "image", "EN"],
["CapGaze (includes capgaze1 and capgaze2)", "image", "EN"],
["Conceptual Captions","image","EN"],
["CC12M","image","EN"],
["Charades","video", "EN"],
["Clotho", "audio","EN"],
["Concadia", "image", "EN"],
["Crossmodal-3600", "image", "36 languages"],
["DenseCap", "image", "EN"],
["Flickr30k", "image", "EN"],
["Flickr8k", "image", "EN"],
["FlickrStyle10K", "image (+caption)", "EN"],
["PASCAL", "image", "EN"],
["Flickr8k-CN", "image", "EN, CN"],
["Multi30k 2016", "image", "EN, DE"],
["Multi30k 2017 (caption information for French)", "image", "EN, FR, (DE)"],
["Multi30k 2018 (caption information for Czech)", "image", "EN, CZE, (DE, FR)"],
["Dutch Descriptions to Flickr30k", "image", "EN, NL (DE)"],
["GoodNews", "image", "EN"],
["GoogleRefExp", "image", "EN"],
["Hephaestus", "image", "EN"],
["How2R", "video", "EN"],
["HowTo100M", "video", "EN"],
["Image Paragraph Captioning", "image", "EN"],
["InFashAI", "image", "EN"],
["Kinetics-GEB+", "video", "EN"],
["Laion-5B", "image", "EN, RU, FR, DE, ES, CN"],
["Localized Narratives", "image", "EN"],
["MACS", "audio", "EN"],
["MMAC Captions", "video", "EN"],
["MAD", "video", "EN"],
["MPII-MD", "video", "EN"],
["MS COCO Captions (includes MS COCO c5 and MS COCO c40)", "image", "EN"],
["DeCOCO", "image", "EN, DE"],
["COCO-CN", "image", "EN, CN"],
["MIC test data", "image", "EN, DE, FR"],
["UIT-ViLC", "image", "EN, VI"],
["MSR-VTT-10K", "video", "EN"],
["MSVD", "video", "EN"],
["M-VAD", "video", "EN"],
["nocaps", "image", "EN"],
["NSVA", "video", "EN"],
["PoseScript", "image", "EN"],
["RedCaps", "image", "EN"],
["ReferItGame", "image", "EN"],
["RPCD", "image", "EN"],
["RSICD", "image", "EN"],
["SCICAP", "image", "EN"],
["SentiCap", "image (+caption)", "EN"],
["TACoS (original)", "video", "EN"],
["TACoS Multi-level", "video", "EN"],
["TextCaps", "image", "EN"],
["TGIF (Tumblr GIF)", "video", "EN"],
["VATEX", "video", "EN, CN"],
["Violin", "video", "EN"],
["Visual Genome", "image", "EN"],
["VizWiz-Captions", "image", "EN"],
["WebLI", "image", "109 languages"],
["WebVid-2M", "video", "EN"],
["WikiScenes", "image", "EN, FR, ES"],
["Winoground", "image", "EN"],
["WIT", "image", "108 languages"],
["YouCook", "video", "EN"],
["YouCook2", "video", "EN"],
["YJ Captions 26k Dataset", "image", "EN, JA"]
// Add more dataset data here
];
// Get the table body
const tbody = document.querySelector("#dataset-table tbody");
// Generate table rows dynamically
datasetData.forEach((data, index) => {
const row = document.createElement("tr");
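// First cell holds the running row number (1-based); the remaining cells come from the data entry.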
row.innerHTML = `<td>${index+1}</td>`;
data.forEach(cellData => {
const cell = document.createElement("td");
cell.textContent = cellData;
row.appendChild(cell);
});
tbody.appendChild(row);
});
</script>
</body>
</html>