access NCBI database programitically-a snp case

有时候,我们需要大批量的查询ncbi数据库,这时,我们很可能需要用到ncbi为我门提供的程序化接口:Eutils。
Eutils有五大部分组成,具体的doc参见ncbi的online book。这里提供一个检索snp信息的例子:



use LWP::Simple;
use List::MoreUtils qw/ uniq /;

sub xmlparse{
    #parameter: 1,xmlstring 2,out file descriptor
    $snpfetchrst = @_[0];
    $outpfile = @_[1];
    #get rsID, validation info, allele info, gene info, maf allele info
    my $rsid;
    my $taxid;
    my $content;
    my @geneid;
    my $alleinfo;
    my $filtr = false;
    #print table head
    #print $outpfile "refSNP","\t","alleles","\t","minoralle","\t","entrezGeneID","\n";

    while ($snpfetchrst =~ /<Rs rsId=\"(\d+)\".*?taxId=\"(\d+)\">(.*?)<\/Rs>/sg) {
        @geneid = ();
        $filtr = false;
        $rsid = $1;
        $taxid = $2;
        $content = $3;
        if($taxid != 9606){
            next;
        }
           else{
            if (($content =~ /<Validation (.*?)\/>/sg)|($content =~ /<Validation .*?>(.*?)<\/Validation>/sg)) {
                $valid_state = $1;
                #print $valid_state,"\n";
                if($valid_state){    #validation state
                    $filtr = true;
                    if ($content =~ /<Sequence.*?>(.*?)<\/Sequence>/sg) {
                    $seqinfo = $1;
                    $alleinfo = $1 if ($seqinfo =~ /<Observed>(.*?)<\/Observed>/);
                    }
                          if ($content =~ /<Assembly.*?reference="true">(.*?)<\/Assembly>/sg){
                        $assemContent = $1;
                        if($assemContent =~ /<Component.*?>(.*?)<\/Component>/sg){
                            $compcontent = $1;
                            if($compcontent =~ /<MapLoc.*?>(.*?)<\/MapLoc>/sg){
                                $funcinfo = $1;
                                #print $funcinfo,"\n";
                                while($funcinfo =~ /<FxnSet geneId=\"(\d+)\".*?\/>/sg){
                                    #print $1,"\n";
                                    push(@geneid,$1);
                                }
                                @geneid = uniq(@geneid);    #require moreUtils module        
                            }
                        }    
                    }
                    if ($content =~ /<Frequency.*?allele=\"(.*?)\".*?\/>/sg){
                        $malle = $1;        
                    }
                }
                      else{
                    next;
                }
               }
            else{
                $filtr = false;
                next;
            }
        }
        if($filtr) {print $outpfile ($rsid,"\t",$alleinfo,"\t",$malle,"\t",join(',',@geneid),"\n");}
    }
}    #end of xml_parse

my $utils = "http://www.ncbi.nlm.nih.gov/entrez/eutils";

open($refsnp,"<","snps.txt");

while (<$refsnp>) {
    chomp;
    $_ =~ /(\d+)/;
    $rsid = $1;
    push(@rss$rsid);
}
close($refsnp);

my $id_list = join(',',@rss);
#print $id_list,"\n";

#assemble  the epost URL as an HTTP POST call

$url = $utils . "/epost.fcgi";

$url_params = "db=snp&id=$id_list";

#create HTTP user agent
$ua = new LWP::UserAgent;
$ua->agent("elink/1.0 " . $ua->agent);


#create HTTP request object
$req = new HTTP::Request POST => "$url";
$req->content_type('application/x-www-form-urlencoded');
$req->content("$url_params");


#post the HTTP request
$response = $ua->request($req); 
$epostrst = $response->content;

my $querykey;
my $webEnv;

if($epostrst =~ /<QueryKey>(.*?)<\/QueryKey>/sg){
    $querykey = $1;
    #print $querykey,"\n";
}
if($epostrst =~ /<WebEnv>(.*?)<\/WebEnv>/sg){
    $webEnv = $1;
    #print $webEnv,"\n";
}
my $count = @rss;
my $retmax = 600;
my $efetch = $utils."/efetch.fcgi?";
open(my $outpf,">>","snp_queryinfo.txt");
if(defined($querykey)&&defined($webEnv)){
    for($retstart = 0;$retstart < $count;$retstart += $retmax){
        my $progress = $retstart/$count;
        my $prog = sprintf"%.2f ",$progress);
        print "progression: $prog\n";
        $efetch_url = $efetch."db=snp&WebEnv=$webEnv&query_key=$querykey";
        $efetch_url .= "&retstart=$retstart&retmax=$retmax&retmode=xml";
        #print $efetch_url,"\n";
        $efetch_out = get($efetch_url);
        #print $efetch_out,"\n";
        xmlparse($efetch_out,$outpf);
    }
}
close($outpf);


上面的过程将读入名为snps.txt所存储的refsnp号,然后利用efetch检索数据库信息,最后从xml结果中抽提有用的信息。




posted on 2012-09-03 16:49 ewre 阅读(436) 评论(0)  编辑 收藏 引用 所属分类: perl


只有注册用户登录后才能发表评论。
网站导航: 博客园   IT新闻   BlogJava   知识库   博问   管理


导航

<2012年9月>
2627282930311
2345678
9101112131415
16171819202122
23242526272829
30123456

留言簿(2)

文章分类

文章档案

最新评论

阅读排行榜