数据集下载:https://huggingface.co/datasets/mlfoundations/datacomp_large
import os
from huggingface_hub import snapshot_download
def download_parquet_files(repo_id: str, output_dir: str) -> None:
    """
    Download .parquet files from a Hugging Face dataset repository using snapshot_download.

    Args:
    - repo_id (str): The ID of the Hugging Face dataset repository.
    - output_dir (str): Directory where the .parquet files will be saved.
      It is created (including missing parents) if it does not exist yet.

    Raises:
    - FileExistsError: If output_dir already exists but is not a directory.
    """
    # exist_ok=True replaces the separate "exists?" check: it is race-free when
    # several download processes start at once, and it fails loudly if the
    # path is occupied by a regular file instead of silently skipping creation.
    os.makedirs(output_dir, exist_ok=True)

    # The hub cache is placed inside the output directory rather than in the
    # default user-level cache location.
    cache_dir = os.path.join(output_dir, "cache")

    hf_snapshot_args = dict(
        repo_id=repo_id,
        allow_patterns="*.parquet",  # restrict the snapshot to parquet files
        local_dir=output_dir,
        cache_dir=cache_dir,
        # NOTE(review): newer huggingface_hub releases may deprecate
        # `local_dir_use_symlinks` and `resume_download` -- confirm against the
        # installed version before removing them.
        local_dir_use_symlinks=False,
        repo_type="dataset",
        resume_download=True,
        max_workers=16,
    )
    snapshot_download(**hf_snapshot_args)
if __name__ == "__main__":
    # Replace with your dataset repo ID
    REPO_ID = "mlfoundations/datacomp_large"
    # Replace with your desired output directory
    OUTPUT_DIR = "/data/xiedong/datasets_meizu/datacomp_all/large/metadata"

    # Script entry point: fetch the parquet metadata files for the configured repo.
    download_parquet_files(repo_id=REPO_ID, output_dir=OUTPUT_DIR)
执行下载代码前可以先看看这个,以免连接断开后重新下载时重复下载已完成的文件。