DownloadSRA | Notes

iseq

BioOmics/iSeq
Get fastq.gz data easily

# download.sh
# Usage: sh download.sh [path]
# Runs iseq once for every accession listed (one per line) in
# [path]/SRR_Acc_List.txt, from inside [path]/raw.

# Refuse to run without a target directory: with an empty argument a bare
# `cd` would silently switch to $HOME and continue there.
[ -n "${1:-}" ] || { echo "Usage: sh download.sh [path]" >&2; exit 1; }
cd "$1" || { echo "Error: Unable to change directory to $1" >&2; exit 1; }
[ -r SRR_Acc_List.txt ] || { echo "Error: SRR_Acc_List.txt not found in $1" >&2; exit 1; }

# -p keeps a re-run from failing when raw/ already exists.
mkdir -p raw
# Enter raw/ once, before the loop. Doing it on every iteration fails from
# the second accession onwards because raw/raw does not exist.
cd raw || { echo "Error: Unable to change directory to $1/raw" >&2; exit 1; }

# `|| [ -n "$Run" ]` keeps a final line that has no trailing newline.
while read -r Run || [ -n "$Run" ]; do
    # Skip blank lines so iseq is never called with an empty accession.
    [ -n "$Run" ] || continue
    iseq -i "$Run" -a -g
done < ../SRR_Acc_List.txt

## bash
sh download.sh [path]
## structure
/path
/path/SRR_Acc_List.txt

The script creates a `raw` directory inside [path]; the downloaded data is saved there.

pysradb

pysradb
Get GSE metadata

1
2
3

import pandas as pd
from pysradb.sraweb import SRAweb
# Client object used for the metadata queries in this section.
db = SRAweb()

# GEO series accession to look up.
gse = 'GSE197726'
# Map the GSE accession to its SRA study accession (SRP). The result is a
# table with study_alias and study_accession columns (see the output below).
srp = db.gse_to_srp(gse)
# Bare expression: displays the table when run in a notebook/REPL.
srp
## output
study_alias	study_accession
0	GSE197726	SRP362072

# Fetch the metadata of all SRR runs that belong to the GSE's SRA study.
# NOTE(review): detailed=True presumably adds the per-sample attribute
# columns (tissue, genotype, ...) selected later — confirm in pysradb docs.
df_meta = db.sra_metadata(srp['study_accession'], detailed=True)
# List the available column names; use them to choose which columns to keep.
print(df_meta.columns)
# Preview the first 5 rows.
df_meta.head(5)

# modify
# Columns to keep from the metadata table. Adjust this list per study to the
# names printed from df_meta.columns; selecting a name that is not present
# raises a KeyError.
col = ['run_accession','library_name','study_title','organism_name','library_strategy','library_source',
        'source_name','tissue','circadian time','genotype','treatment','sex','cell type','age']

# Explicit copy so the optional column edits below modify df_index only,
# never df_meta.
df_index = df_meta.loc[:,col].copy()

## modify
## Optional, study-specific recipes for building a 'time' column; uncomment
## and adapt as needed.
# df_index = df_index.rename(columns={'time point': 'time'})
# df_index['time'] = df_index['time'].apply(lambda x: 'CT' + str(x))
# df_index['time'] = df_index['time'].apply(lambda x: str(x).replace(" ",""))
# df_index['time'] = df_index['experiment_title'].apply(lambda x: str(x).split(';')[0].split('_')[-2])
# df_index['time'] = df_index['time'].apply(lambda x: 'ZT' + str(x))
# df_index = df_index.drop('experiment_title', axis=1)

# Put the GSE accession in the first column; the scalar is broadcast to
# every row.
df_index.insert(0,'gse_accession',gse)
# Bare expression: displays the table when run in a notebook/REPL.
df_index

fasterq-dump

fasterq-dump
Convert SRA run (SRR) data to FASTQ files

# run one
fasterq-dump -e 10 -p --split-3 -O [outdir] [sra]

# Run fasterq-dump on every SRR* entry in a directory.
# Usage: sh <this script> [path]
#   [path]  directory holding the SRR* inputs; output goes to [path]/fastq.

# Refuse to run without a directory: with an empty argument a bare `cd`
# would silently switch to $HOME and continue there.
[ -n "${1:-}" ] || { echo "Usage: $0 [path]" >&2; exit 1; }
cd "$1" || { echo "Error: Unable to change directory to $1" >&2; exit 1; }

# Resolve the directory after the cd, so that a relative [path] is not
# applied a second time when the paths below are built.
workdir=$(pwd)
fqdir="${workdir}/fastq"
# -p keeps a re-run from failing when fastq/ already exists.
mkdir -p "${fqdir}" || { echo "Error: Unable to create ${fqdir}" >&2; exit 1; }

for sra_file in "${workdir}"/SRR*[0-9]; do
  # With no match the pattern is left unexpanded; do not pass it to the tool.
  [ -e "${sra_file}" ] || continue
  start_time=$(date +"%Y-%m-%d %H:%M:%S")
  fasterq-dump -e 10 -p --split-3 -O "${fqdir}" "${sra_file}"
  end_time=$(date +"%Y-%m-%d %H:%M:%S")
  echo "Sample ID: ${sra_file} - Start Time: ${start_time} - End Time: ${end_time}"
done

parallel-fastq-dump

parallel-fastq-dump
Convert SRA run (SRR) data to FASTQ or gzip-compressed FASTQ (fastq.gz) files

# run one
parallel-fastq-dump -s [sra_file] -t 8  -O [outdir] --split-files --gzip

# Run parallel-fastq-dump on every SRR* entry in a directory.
# Usage: sh <this script> [path]
#   [path]  directory holding the SRR* inputs; output goes to [path]/fastq.

# Refuse to run without a directory: with an empty argument a bare `cd`
# would silently switch to $HOME and continue there.
[ -n "${1:-}" ] || { echo "Usage: $0 [path]" >&2; exit 1; }
cd "$1" || { echo "Error: Unable to change directory to $1" >&2; exit 1; }

# Resolve the directory after the cd, so that a relative [path] is not
# applied a second time when the paths below are built.
workdir=$(pwd)
fqdir="${workdir}/fastq"
# -p keeps a re-run from failing when fastq/ already exists.
mkdir -p "${fqdir}" || { echo "Error: Unable to create ${fqdir}" >&2; exit 1; }

for sra_file in "${workdir}"/SRR*[0-9]; do
  # With no match the pattern is left unexpanded; do not pass it to the tool.
  [ -e "${sra_file}" ] || continue
  start_time=$(date +"%Y-%m-%d %H:%M:%S")
  parallel-fastq-dump -s "${sra_file}" -t 8 -O "${fqdir}" --split-files --gzip
  end_time=$(date +"%Y-%m-%d %H:%M:%S")
  echo "Sample ID: ${sra_file} - Start Time: ${start_time} - End Time: ${end_time}"
done