Skip to content

Instantly share code, notes, and snippets.

@huseinzol05
Created March 9, 2025 07:12
Show Gist options
  • Save huseinzol05/e570d6ecb5fe62ccd27cf462719bcbe4 to your computer and use it in GitHub Desktop.
Save huseinzol05/e570d6ecb5fe62ccd27cf462719bcbe4 to your computer and use it in GitHub Desktop.
distributed zip files
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"id": "83272ed4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2222136"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from glob import glob\n",
"import os\n",
"\n",
"# Target dataset repository on the Hugging Face Hub and the local folder holding the audio.\n",
"repository = 'mesolitica/pseudolabel-malaysian-youtube-whisper-large-v3-timestamp'\n",
"folder = 'output-audio'\n",
"# glob() returns paths in arbitrary (filesystem-dependent) order; sort them so that the\n",
"# file -> shard/part assignment is reproducible if the notebook has to be re-run.\n",
"files = sorted(glob(f'{folder}/*.mp3'))\n",
"len(files)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "3d83ced9",
"metadata": {},
"outputs": [],
"source": [
"import zipfile\n",
"import mp\n",
"import time\n",
"from huggingface_hub import HfFileSystem\n",
"from huggingface_hub import HfApi\n",
"from tqdm import tqdm\n",
"# Hub client for the notebook process; `loop` builds its own client per call.\n",
"api = HfApi()\n",
"\n",
"# Byte budget per zip part (5 GB): a part is flushed before the summed size of its\n",
"# input files would reach this value.\n",
"partition_size = 5e9"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "de709a3a",
"metadata": {},
"outputs": [],
"source": [
"def _zip_and_upload(api, members, part_name):\n",
"    \"\"\"\n",
"    Zip `members` into `part_name`, upload the archive to `repository`, then delete the local copy.\n",
"\n",
"    api: HfApi client used for the upload.\n",
"    members: list of file paths to store in the archive (stored under the same path).\n",
"    part_name: archive file name, used both locally and as the path inside the repo.\n",
"\n",
"    Upload errors are retried every 60 seconds without limit.\n",
"    \"\"\"\n",
"    try:\n",
"        with zipfile.ZipFile(part_name, 'w', zipfile.ZIP_DEFLATED) as zipf:\n",
"            for f in members:\n",
"                zipf.write(f, arcname=f)\n",
"\n",
"        while True:\n",
"            try:\n",
"                api.upload_file(\n",
"                    path_or_fileobj=part_name,\n",
"                    path_in_repo=part_name,\n",
"                    repo_id=repository,\n",
"                    repo_type=\"dataset\",\n",
"                )\n",
"                break\n",
"            # Only catch Exception: a bare `except` would also swallow\n",
"            # KeyboardInterrupt / SystemExit and make the worker unstoppable.\n",
"            except Exception as e:\n",
"                print(f'upload of {part_name} failed ({e!r}), retrying in 60 seconds')\n",
"                time.sleep(60)\n",
"    finally:\n",
"        # Free the disk space even if the worker is interrupted mid-way.\n",
"        if os.path.exists(part_name):\n",
"            os.remove(part_name)\n",
"\n",
"\n",
"def loop(files):\n",
"    \"\"\"\n",
"    Pack one shard of files into zip parts and upload every part to the dataset repo.\n",
"\n",
"    files: tuple of (list of file paths, shard index). The shard index only appears in\n",
"        the part names, `{folder}-{index}-{current_index}.zip`.\n",
"\n",
"    Files are accumulated in order; a part is flushed just before adding a file would\n",
"    bring the summed input size to `partition_size` or above. A single file that is\n",
"    itself that large ends up alone in its own part.\n",
"    \"\"\"\n",
"    files, index = files\n",
"    api = HfApi()\n",
"    current_index = 0\n",
"    total = 0\n",
"    temp = []\n",
"    for path in tqdm(files):\n",
"        s = os.stat(path).st_size\n",
"        # `temp and` prevents uploading an empty archive when the first file of a\n",
"        # part is already at or above the budget.\n",
"        if temp and s + total >= partition_size:\n",
"            _zip_and_upload(api, temp, f\"{folder}-{index}-{current_index}.zip\")\n",
"            current_index += 1\n",
"            temp = []\n",
"            total = 0\n",
"        temp.append(path)\n",
"        total += s\n",
"\n",
"    # Flush whatever is left after the last full part.\n",
"    if temp:\n",
"        _zip_and_upload(api, temp, f\"{folder}-{index}-{current_index}.zip\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d4efaf5d",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 172371.02it/s]\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "97f9290861bf4ac1ad1be838378eb598",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"output-audio-0-0.zip: 0%| | 0.00/153M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Dry run in the notebook process: pack and upload the first 1000 files as shard 0.\n",
"loop((files[:1000], 0))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "813cf6f5",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 5%|████▏ | 20248/370356 [00:00<00:01, 202455.09it/s]"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3c1ae58009d94ae0a41ff21c354a3e6b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"output-audio-0-0.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "26427efd0dd944a195f83d34e43855da",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"output-audio-4-0.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "856ca3866217441fb251b39dac735eba",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"output-audio-1-0.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9406188d85424ee7a079c24d66b34262",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"output-audio-2-0.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f284f14e517b487cbf954e1774436a98",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"output-audio-3-0.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5b0847a698f54fa684df830dbdbdbee6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"output-audio-5-0.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 15%|███████████▌ | 54627/370356 [42:13<4:04:06, 21.56it/s]"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "cb547761fbdb4a71bb80e99a8e1527b2",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"output-audio-3-1.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "1c240fff0510491692fb5c86748f5bcf",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"output-audio-0-1.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e1d08915f05f464cb65942f8dd64e158",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"output-audio-4-1.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "246f0857d3b845b8a07e95bccc5f3fee",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"output-audio-1-1.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2ea03d565ad64cdab08a03a6a54f899e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"output-audio-2-1.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c521032fde644e8d9b19e166a3ff19b0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"output-audio-5-1.zip: 0%| | 0.00/4.94G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Fan the full file list out over 6 worker processes.\n",
"# NOTE(review): `mp` is a project-local helper; presumably it hands each worker a\n",
"# (chunk, chunk_index) tuple, which is what `loop` unpacks - confirm against mp.py.\n",
"mp.multiprocessing(files, loop, cores=6, returned=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4b2fc596",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "python3.10",
"language": "python",
"name": "python3.10"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment