本人 以fastq.gz文件计数为例分别以perl语言和c语言实现了代码,具体如下:
代码语言:javascript复制#!/usr/bin/perl -w
#
# Copyright (c) xuxiong 2020
# Writer: xuxiong <xuxiong19880610@163.com><xuxiong@me.com>
# Program Date: 2020.01.03
# Modifier: xuxiong <xuxiong19880610@163.com><xuxiong@me.cn>
# Last Modified: 2020.01.03
my $ver="0.0.1";
use strict;
use Getopt::Long;
use Data::Dumper;
use FindBin qw($Bin $Script);
use FileHandle;
use File::Basename qw(basename dirname);
use List::Util qw(first max maxstr min minstr reduce shuffle sum uniq);
use Cwd qw(abs_path getcwd realpath);
use Time::HiRes qw(time);
my $infile;
GetOptions(
"help|?" =>&USAGE,
"i:s"=>$infile
) or &USAGE;
&USAGE unless ($infile ) ;
###############Time_start#############################
my $Time_Start;
$Time_Start = sub_format_datetime(localtime(time()));
print STDERR "nStart Time :[$Time_Start]nn";
my $BEGIN_TIME=time();
######################################################
my $read_count=0;
load_fastq($infile,$read_count);
print STDERR "read count: ",$read_count,"n";
sub load_fastq{
my ($infile,$n)=@_;
open(IN,$infile=~/.gz$/ ? "gzip -cd $infile |" : $infile) || die "Can't open $infilen";
my $read_num=0;
while (<IN>) {
last if(eof(IN));
$$n ;
<IN>;
<IN>;
<IN>;
}
close IN;
}
###############Time_end###########
my $Time_End;
$Time_End = sub_format_datetime(localtime(time()));
print STDERR "nEnd Time :[$Time_End]nn";
print STDERR "Time elapsed :",time()-$BEGIN_TIME," sn";
###############Sub_format_datetime
sub sub_format_datetime {#Time calculation subroutine
my($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst) = @_;
$wday = $yday = $isdst = 0;
sprintf("M-d-d d:d:d", $year 1900, $mon 1, $day, $hour, $min, $sec);
}
sub USAGE {#
my $usage=<<"USAGE";
Program: $0
Version: $ver
Contact: XiongXu <xuxiong@me.com> <xuxiong19880610@163.com>
Example:
perl $0
Description:
This program is used for counting load_fastq file
Usage:
-i infile required
USAGE
print STDERR $usage;
exit;
}
代码语言:javascript复制// gcc -g -O3 -Wall fastq_count_nomal.c -o fastq_count_nomal -lz
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <getopt.h>
#include <sys/time.h>
#include <time.h>
#include <zlib.h>
#include <fcntl.h>
//#include <unistd.h>
struct globalArgs_t {
const char *infile; /* -i option */
int verbosity; /* -v option */
} globalArgs;
void load_file(gzFile fp,unsigned long *Line_num);
static inline int readNextNode(gzFile fq,char *buf[4]) ;
void display_usage(char * argv[]);
static inline long long usec(void);
static gzFile open_input_stream(const char *filename);
long long usec(void){
struct timeval tv;
gettimeofday(&tv,NULL);
return (((long long)tv.tv_sec)*1000000) tv.tv_usec;
}
void display_usage(char * argv[]){
char *buffer=(char* )malloc(10240*sizeof(char));
const char* usage=
"nCopyright (c) 2020n"
"Contact: XiongXu <xuxiong@19880610@163.cn> <xiongxu@me.com> n"
"Discription:n This program is used for cutting the reads to get the specified cycles.n"
"Usage: %s [-i Infile] [-o OUTFILE] [-s start] [-e end] [-h] n"
"Example1:n %s -i /share/work1/staff/xuxiong/test/13C37198_L7_I012.R1.clean.fastq.gz -s 0 -e 100 -o out n"
"Example2:n zcat /share/work1/staff/xuxiong/test/13C37198_L7_I012.R1.clean.fastq.gz |%s -e 100 -o sss n"
"Example3:n zcat /share/work1/staff/xuxiong/test/13C37198_L7_I012.R1.clean.fastq.gz |%s -e 50 |less -S n"
"n"
" [-i Infile] = Infile.default is stdin [required]n"
" [-v ] = verbose mod [option]n"
" [-h] This helpful help screen. [option]n"
"n";
sprintf(buffer,usage,argv[0],argv[0],argv[0],argv[0]);
fprintf(stderr,"%s",buffer);
free(buffer);
exit(1);
}
gzFile open_input_stream(const char *filename){
int fd ;
if (strncmp(filename,"-", 1)==0 || !strcmp(filename,"")) {
fd = STDIN_FILENO;
} else {
#if defined(__APPLE__) && defined(__MACH__) || defined(unix) || defined(linux)
fd = open(filename, O_CREAT | O_RDONLY, 0666 );
#else
fd = _open(filename, _O_CREAT|_O_BINARY | _O_RDONLY , 0666 );
#endif
if (fd==-1) fprintf(stderr, "Failed to create input file (%s)", filename);
}
gzFile in = gzdopen(fd,"rb");
return in;
}
int readNextNode(gzFile fq,char *buf[4]) {
int i=1;
buf[0]=gzgets(fq,buf[0],1024*sizeof(char));
if (!gzeof(fq)) {
buf[1]=gzgets(fq,buf[1],1024*sizeof(char));
buf[2]=gzgets(fq,buf[2],1024*sizeof(char));
buf[3]=gzgets(fq,buf[3],1024*sizeof(char));
if (globalArgs.verbosity) fprintf(stdout,"%s%s%s%s",buf[0],buf[1],buf[2],buf[3]);
} else {
return 0;
}
return i;
}
void load_file(gzFile fp,unsigned long *Line_num) {
long long begin,end;
begin=usec();
char *buf[4];
int i=0;
for(i=0;i<4; i) {
buf[i] = (char *)calloc(1024,sizeof(char));
}
while (1) {
int status=readNextNode(fp,buf);
if (!status) break;
(*Line_num) ;
}
end=usec();
fprintf(stderr,"Total_reads: %lunFinished in %.3f sn",*Line_num,(double)(end-begin)/CLOCKS_PER_SEC);
for(i=0;i<4; i) {
free(buf[i]);
}
}
int main(int argc, char *argv[])
{
int opt = 0;
globalArgs.infile="-";
globalArgs.verbosity=0;
const char *optString = "i:vh?";
if (argc<2){
display_usage(argv);
exit(1);
}
opt = getopt( argc, argv, optString );
while( opt != -1 ) {
switch( opt ) {
case 'i':
globalArgs.infile = optarg;
break;
case 'v':
globalArgs.verbosity ;
break;
case '?': /* fall-through is intentional */
case 'h':
display_usage(argv);
break;
default:
fprintf(stderr,"error parameter!n");
/* You won't actually get here. */
break;
}
opt = getopt( argc, argv, optString );
}
gzFile in=open_input_stream(globalArgs.infile);
unsigned long reads_num=0;
load_file(in,&reads_num);
gzclose(in);
return 0;
}
运行效果如下:
代码语言:javascript复制perl fastq_count.pl -i /data1/data/tangqiong/wes-ycb/20191223/D191217P2-M192842WS_L2_1.fq.gz
Start Time :[2020-01-14 12:30:10]
read count: 47861176
End Time :[2020-01-14 12:31:59]
Time elapsed :109.049475908279 s
代码语言:javascript复制./fastq_count_normal -i /data1/data/tangqiong/wes-ycb/20191223/D191217P2-M192842WS_L2_1.fq.gz
Total_reads: 47861176
Finished in 66.111 s
可见c写的比perl的足足快了43 s,要知道Illumina/BGI的测序仪目前下机的格式其实都是BGZF(一种兼容gzip的可并行压缩/解压格式),本人调用htslib中的bgzf,用c语言实现的工具采用6个线程运行的效果如下:
代码语言:javascript复制./fastq_count_bgzip -t 6 -i /data1/data/tangqiong/wes-ycb/20191223/D191217P2-M192842WS_L2_1.fq.gz
Total_reads: 47861176
Finished in 14.588 s