Skip to content

Commit

Permalink
feat: latest changes
Browse files Browse the repository at this point in the history
  • Loading branch information
AshishKumar4 committed Aug 10, 2024
1 parent d019a7e commit 76c40ce
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 51 deletions.
6 changes: 3 additions & 3 deletions datasets/custom datasets downloader.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

img2dataset --url_list $1 --input_format "parquet"\
--url_col "url" --caption_col "caption" --output_format arrayrecord\
--output_folder $2 --processes_count 64\
--thread_count 64 --image_size 256 --min_image_size 100 --number_sample_per_shard 40000 \
--output_folder $2 --processes_count 64 --thread_count 64 \
--image_size 256 --min_image_size 100 --number_sample_per_shard 80000 --max_aspect_ratio 2.4 \
--enable_wandb True --disallowed_header_directives '[]' --compute_hash None --max_shard_retry 3 --timeout 60

# gs://flaxdiff-datasets-regional/arrayrecord/laion-aesthetics-12m+mscoco-2017
# gs://flaxdiff-datasets-regional/arrayrecord/laion-aesthetics-12m+mscoco-2017
79 changes: 31 additions & 48 deletions datasets/dataset preparations.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,16 @@
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/mrwhite0racle/.local/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import webdataset as wds\n",
"import jax\n",
Expand Down Expand Up @@ -37,7 +46,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -80,7 +89,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -269,37 +278,16 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"coyo700 = load_dataset(\"kakaobrain/coyo-700m\")#, num_proc=32)"
"coyo700 = load_dataset(\"kakaobrain/coyo-700m\", num_proc=32)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'coyo700' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[14], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mcoyo700\u001b[49m\n",
"\u001b[0;31mNameError\u001b[0m: name 'coyo700' is not defined"
]
}
],
"source": [
"coyo700"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -312,9 +300,11 @@
"\n",
"heavyFilterMap = {\n",
" # \"word_count\": {\"min\": 0, \"max\": 100},\n",
" \"clip_similarity_vitl14\": {\"min\": 0.25, \"max\": 100},\n",
" \"aesthetic_score_laion_v2\": {\"min\": 6.05, \"max\": 100},\n",
" \"clip_similarity_vitl14\": {\"min\": 0.26, \"max\": 100},\n",
" \"aesthetic_score_laion_v2\": {\"min\": 5.0, \"max\": 100},\n",
" \"watermark_score\": {\"min\": 0, \"max\": 0.8},\n",
" \"width\": {\"min\":512, \"max\":99999},\n",
" \"height\": {\"min\":512, \"max\":99999},\n",
"}\n",
"\n",
"def coyoFilter(filterMap):\n",
Expand All @@ -329,14 +319,14 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Filter (num_proc=64): 100%|██████████| 746972269/746972269 [03:08<00:00, 3953092.18 examples/s]\n"
"Filter (num_proc=64): 100%|██████████| 746972269/746972269 [03:33<00:00, 3505937.43 examples/s]\n"
]
}
],
Expand All @@ -347,16 +337,16 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1602504"
"16586129"
]
},
"execution_count": 12,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -386,14 +376,14 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Map (num_proc=16): 100%|██████████| 1602504/1602504 [00:27<00:00, 59103.40 examples/s] \n"
"Map (num_proc=16): 100%|██████████| 16586129/16586129 [01:15<00:00, 218835.67 examples/s]\n"
]
}
],
Expand All @@ -406,36 +396,29 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Creating parquet from Arrow format: 9%|▉ | 152/1603 [00:00<00:01, 741.38ba/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Creating parquet from Arrow format: 100%|██████████| 1603/1603 [00:02<00:00, 748.05ba/s]\n"
"Creating parquet from Arrow format: 100%|██████████| 16587/16587 [00:22<00:00, 729.06ba/s]\n"
]
},
{
"data": {
"text/plain": [
"330153346"
"3343770169"
]
},
"execution_count": 16,
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_data.to_parquet(\"aestheticCoyo_0.25clip_6aesthetic.parquet\")"
"final_data.to_parquet(\"aestheticCoyo_0.26_clip_5.5aesthetic_256plus.parquet\")"
]
},
{
Expand Down

0 comments on commit 76c40ce

Please sign in to comment.