export HF_TOKEN=your_huggingface_token
export AWS_ACCESS_KEY_ID=your_s3_access_id
export AWS_SECRET_ACCESS_KEY=your_s3_access_key

1. Download the full training dataset from Hugging Face: The Stack V2 - Train Full IDs
bash download_huggingface.sh | xargs -n 1 -P 20 wget -c --no-check-certificate --header="Authorization: Bearer ${HF_TOKEN}"
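If you prefer not to pipe URLs into wget, a roughly equivalent download can be done with huggingface_hub. This is only a sketch, not the project's script: the repo id `bigcode/the-stack-v2-train-full-ids` and the local directory are assumptions, so adjust them to your setup.

```python
# Hypothetical alternative to download_huggingface.sh: fetch the dataset
# snapshot with huggingface_hub instead of generating URLs for wget.
import os

from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="bigcode/the-stack-v2-train-full-ids",       # assumed dataset repo id
    repo_type="dataset",
    local_dir="./stackv2/the-stack-v2-train-full-ids",   # assumed local path
    token=os.environ["HF_TOKEN"],
    max_workers=20,                                       # parallel downloads, mirroring xargs -P 20
)
```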
2. Extract blob_id, repo_name, file_path, encode, and language from the-stack-v2-train-full-ids into ./stackv2/blob_ids/:

python extract_info.py

The extracted data files will be stored in ./stackv2/blob_ids/ with the .jsonl suffix.
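For reference, below is a minimal sketch of the kind of extraction extract_info.py performs. The field names mirror the step description above, and the input/output paths and per-shard file naming are assumptions; the actual parquet schema of the-stack-v2-train-full-ids may differ (for example, files may be nested per repository), so treat this as an illustration rather than the project's code.

```python
# Sketch: read the downloaded parquet shards, keep the listed fields,
# and write one .jsonl file per shard into ./stackv2/blob_ids/.
import glob
import json
import os

import pyarrow.parquet as pq

SRC = "./stackv2/the-stack-v2-train-full-ids"  # assumed location of the downloaded parquet shards
DST = "./stackv2/blob_ids"
COLUMNS = ["blob_id", "repo_name", "file_path", "encode", "language"]  # field names from the step above

os.makedirs(DST, exist_ok=True)
for i, parquet_path in enumerate(sorted(glob.glob(f"{SRC}/**/*.parquet", recursive=True))):
    table = pq.read_table(parquet_path, columns=COLUMNS)
    out_path = os.path.join(DST, f"{i}.jsonl")
    with open(out_path, "w", encoding="utf-8") as f:
        for row in table.to_pylist():
            f.write(json.dumps(row, ensure_ascii=False) + "\n")
```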
3. Download the file contents from S3, splitting the work across processes (and across machines if needed):

# On machine A
nohup python get_s3.py 0 500 > log0_500.txt 2>&1 &
nohup python get_s3.py 500 1000 > log500_1000.txt 2>&1 &
nohup python get_s3.py 1000 1500 > log1000_1500.txt 2>&1 &
nohup python get_s3.py 1500 2000 > log1500_2000.txt 2>&1 &
nohup python get_s3.py 2000 2500 > log2000_2500.txt 2>&1 &
# On machine B
nohup python get_s3.py 2500 3000 > log2500_3000.txt 2>&1 &
nohup python get_s3.py 3000 3500 > log3000_3500.txt 2>&1 &
nohup python get_s3.py 3500 4000 > log3500_4000.txt 2>&1 &
nohup python get_s3.py 4000 4500 > log4000_4500.txt 2>&1 &
nohup python get_s3.py 4500 5000 > log4500_5000.txt 2>&1 &

TIPS: Each machine should have at least 24 GB of memory (in practice, five get_s3.py processes use about 15 GB).
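For orientation, here is a minimal sketch of the per-blob download that get_s3.py is assumed to perform. It assumes each .jsonl shard in ./stackv2/blob_ids/ lists blob_ids and that the two command-line arguments select a range of shard indices; the softwareheritage bucket layout follows The Stack v2 documentation, but the shard naming and output handling below are assumptions, not the project's actual code.

```python
# Sketch: fetch gzip-compressed file contents from the Software Heritage S3
# bucket for every blob_id listed in a range of .jsonl shards.
import gzip
import json
import os
import sys

import boto3

session = boto3.Session(
    aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
)
s3 = session.client("s3")


def download_blob(blob_id: str, encoding: str = "utf-8") -> str:
    # Blobs are stored gzip-compressed under content/<blob_id> in the
    # "softwareheritage" bucket (per The Stack v2 documentation).
    obj = s3.get_object(Bucket="softwareheritage", Key=f"content/{blob_id}")
    return gzip.decompress(obj["Body"].read()).decode(encoding, errors="replace")


if __name__ == "__main__":
    start, end = int(sys.argv[1]), int(sys.argv[2])
    for shard_idx in range(start, end):
        shard = f"./stackv2/blob_ids/{shard_idx}.jsonl"  # assumed shard naming
        if not os.path.exists(shard):
            continue
        with open(shard, encoding="utf-8") as f:
            for line in f:
                record = json.loads(line)
                content = download_blob(record["blob_id"], record.get("encode", "utf-8"))
                # ... write `content` wherever the downstream steps expect it
```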