你不知道c语言写的程序要多块——以NGS fastq文件reads计数为例

2020-08-10 15:35:42 浏览数 (1)

本人 以fastq.gz文件计数为例分别以perl语言和c语言实现了代码,具体如下:

代码语言:javascript复制
#!/usr/bin/perl -w
#
# Copyright (c)   xuxiong 2020
# Writer:         xuxiong <xuxiong19880610@163.com><xuxiong@me.com>
# Program Date:   2020.01.03
# Modifier:       xuxiong <xuxiong19880610@163.com><xuxiong@me.cn>
# Last Modified:  2020.01.03
my $ver="0.0.1";

use strict;
use Getopt::Long;
use Data::Dumper;
use FindBin qw($Bin $Script);
use FileHandle;
use File::Basename qw(basename dirname);
use List::Util qw(first max maxstr min minstr reduce shuffle sum uniq);
use Cwd qw(abs_path getcwd realpath);
use Time::HiRes qw(time);

my $infile;
GetOptions(
            "help|?" =>&USAGE,
            "i:s"=>$infile
            ) or &USAGE;
&USAGE unless ($infile ) ;
###############Time_start#############################
my $Time_Start;
$Time_Start = sub_format_datetime(localtime(time()));
print STDERR "nStart Time :[$Time_Start]nn";
my $BEGIN_TIME=time();
######################################################
my $read_count=0;
load_fastq($infile,$read_count);
print STDERR "read count: ",$read_count,"n";
sub load_fastq{
  my ($infile,$n)=@_;
  open(IN,$infile=~/.gz$/ ? "gzip -cd $infile |" : $infile) || die "Can't open $infilen";
  my $read_num=0;
  while (<IN>) {
      last if(eof(IN));
      $$n  ;
      <IN>;
      <IN>;
      <IN>;
  }
  close IN;
}

###############Time_end###########
my $Time_End;
$Time_End = sub_format_datetime(localtime(time()));
print STDERR "nEnd Time :[$Time_End]nn";
print STDERR "Time elapsed :",time()-$BEGIN_TIME," sn";
###############Sub_format_datetime
sub sub_format_datetime {#Time calculation subroutine
    my($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst) = @_;
    $wday = $yday = $isdst = 0;
    sprintf("M-d-d d:d:d", $year 1900, $mon 1, $day, $hour, $min, $sec);
}

sub USAGE {#
    my $usage=<<"USAGE";
Program: $0
Version: $ver
Contact: XiongXu <xuxiong@me.com> <xuxiong19880610@163.com>

Example:
  perl $0 
Description:
  This program is used for counting load_fastq file

Usage:
  -i                    infile                                 required

USAGE
    print STDERR $usage;
    exit;
}
代码语言:javascript复制
// gcc -g -O3 -Wall fastq_count_nomal.c -o fastq_count_nomal -lz
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <getopt.h>
#include <sys/time.h>
#include <time.h>
#include <zlib.h>
#include <fcntl.h>
//#include <unistd.h>

struct globalArgs_t {
    const char *infile;    /* -i option */
    int verbosity;    /* -v option */
} globalArgs;

void load_file(gzFile fp,unsigned long *Line_num);
static inline int readNextNode(gzFile fq,char *buf[4]) ;
void display_usage(char * argv[]);
static inline long long usec(void);
static gzFile open_input_stream(const char *filename);

long long usec(void){
    struct timeval tv;
    gettimeofday(&tv,NULL);
    return (((long long)tv.tv_sec)*1000000) tv.tv_usec;
}

void display_usage(char * argv[]){
    char *buffer=(char* )malloc(10240*sizeof(char));
    const char* usage=
"nCopyright (c) 2020n" 
"Contact: XiongXu <xuxiong@19880610@163.cn> <xiongxu@me.com> n" 
"Discription:n  This program is used for cutting the reads to get the specified cycles.n" 
"Usage: %s [-i Infile] [-o OUTFILE] [-s start] [-e end] [-h] n" 
"Example1:n  %s -i /share/work1/staff/xuxiong/test/13C37198_L7_I012.R1.clean.fastq.gz -s 0 -e 100 -o out n" 
"Example2:n  zcat /share/work1/staff/xuxiong/test/13C37198_L7_I012.R1.clean.fastq.gz |%s -e 100 -o sss n" 
"Example3:n  zcat /share/work1/staff/xuxiong/test/13C37198_L7_I012.R1.clean.fastq.gz |%s -e 50 |less -S  n" 
"n" 
"   [-i Infile]    = Infile.default is stdin                        [required]n" 
"   [-v ]          = verbose mod                                    [option]n" 
"   [-h] This helpful help screen.                                  [option]n" 
"n";
    sprintf(buffer,usage,argv[0],argv[0],argv[0],argv[0]);
    fprintf(stderr,"%s",buffer);
    free(buffer);
    exit(1);
}

gzFile open_input_stream(const char *filename){
    int fd ;
    if (strncmp(filename,"-", 1)==0 || !strcmp(filename,"")) {
        fd = STDIN_FILENO;
    } else {
#if defined(__APPLE__) && defined(__MACH__) || defined(unix) || defined(linux)
        fd = open(filename, O_CREAT | O_RDONLY, 0666 );
#else
        fd = _open(filename, _O_CREAT|_O_BINARY | _O_RDONLY , 0666 );
#endif
        if (fd==-1) fprintf(stderr, "Failed to create input file (%s)", filename);
    }
    gzFile in = gzdopen(fd,"rb");
    return in;
}

int readNextNode(gzFile fq,char *buf[4]) {
    int i=1;
    buf[0]=gzgets(fq,buf[0],1024*sizeof(char));
    if (!gzeof(fq)) {
        buf[1]=gzgets(fq,buf[1],1024*sizeof(char));
        buf[2]=gzgets(fq,buf[2],1024*sizeof(char));
        buf[3]=gzgets(fq,buf[3],1024*sizeof(char));
        if (globalArgs.verbosity) fprintf(stdout,"%s%s%s%s",buf[0],buf[1],buf[2],buf[3]);
    } else {
        return 0;
    }
    return i;
}

void load_file(gzFile fp,unsigned long *Line_num) {
    long long begin,end;
    begin=usec();
    char *buf[4];
    int i=0;
    for(i=0;i<4;  i) {
        buf[i] = (char *)calloc(1024,sizeof(char));
    }
    while (1) {
        int status=readNextNode(fp,buf);
        if (!status) break;
        (*Line_num)  ;
    }
    end=usec();
    fprintf(stderr,"Total_reads: %lunFinished in %.3f sn",*Line_num,(double)(end-begin)/CLOCKS_PER_SEC);
    for(i=0;i<4;  i) {
        free(buf[i]);
    }
}

int main(int argc, char *argv[])
{
    int opt = 0;
    globalArgs.infile="-";
    globalArgs.verbosity=0;
    const char *optString = "i:vh?";
    if (argc<2){
        display_usage(argv);
        exit(1);
    }
    opt = getopt( argc, argv, optString );
    while( opt != -1 ) {
        switch( opt ) {
            case 'i':
                globalArgs.infile = optarg;
                break;
            case 'v':
                globalArgs.verbosity  ;
                break;
            case '?':    /* fall-through is intentional */
            case 'h':
                display_usage(argv);
                break;
            default:
                fprintf(stderr,"error parameter!n");
                /* You won't actually get here. */
                break;
        }
        opt = getopt( argc, argv, optString );
    }
    gzFile in=open_input_stream(globalArgs.infile);
    unsigned long reads_num=0;
    load_file(in,&reads_num);
    gzclose(in);
    return 0;
}

运行效果如下:

代码语言:javascript复制
perl fastq_count.pl -i /data1/data/tangqiong/wes-ycb/20191223/D191217P2-M192842WS_L2_1.fq.gz

Start Time :[2020-01-14 12:30:10]

read count: 47861176

End Time :[2020-01-14 12:31:59]

Time elapsed :109.049475908279 s
代码语言:javascript复制
./fastq_count_normal -i /data1/data/tangqiong/wes-ycb/20191223/D191217P2-M192842WS_L2_1.fq.gz 
Total_reads: 47861176
Finished in 66.111 s

可见c写的比perl的足足快了43 s,要知道Illumina/BGI的测序仪目前下机的格式其实都是BGZF(一种兼容gzip的可并行压缩/解压格式),本人调用htslib中的bgzf,用c语言实现的工具采用6个线程运行的效果如下:

代码语言:javascript复制
./fastq_count_bgzip -t 6 -i /data1/data/tangqiong/wes-ycb/20191223/D191217P2-M192842WS_L2_1.fq.gz 
Total_reads: 47861176
Finished in 14.588 s

0 人点赞