原理
关于获取网页信息,小编其实在很早就有写到SAS-爬取帖子下的邮箱,给他们发一封邮件...嗯,其实也就是通过获取网页代码,从网页代码中提取出需要的信息~还是看一张截图(谷歌浏览器按F12即可查看页面代码)
原理就是将这些写入到SAS数据集中,再查看需要提取的字符串出现的规律是怎么样的,然后基于规律写程序就可以了~这个主要就是看是否能发现数据的规律~不细说了,下面来上代码!
代码语言:javascript复制/*****************************SAS 爬取 化工ATC数据库*******************************/
*1获取首页网站ATC编码;
/* Session options, consolidated into one statement.                            */
/* The original issued OPTIONS five times with contradictions: compress=yes     */
/* was set twice, missing= was set twice, and nosource/nosource2/nonotes were   */
/* turned off and immediately turned back on by the next statement. This single */
/* statement reproduces the final effective state of the original sequence.     */
/* NOTE(review): the last original statement used missing='' (empty literal);   */
/* missing=' ' below displays missing values as a blank either way - confirm.   */
options nocenter nosymbolgen nomlogic nomprint
        orientation=landscape nodate nonumber nobyline
        missing=' ' formchar="|----| |--- =|-/<>*"
        validvarname=upcase nofmterr mrecall noxwait
        extendobscounter=no compress=yes
        source source2 notes;
/* getfirst: download the HTML page at &URL, group its lines into <ul>...</ul> */
/* blocks, and parse each block into one observation of &OUTDS with:           */
/*   ATC_CODE   - the ATC classification code                                  */
/*   ATC_EN_NAM - the English drug/class name                                  */
/*   ATC_CN_NAM - the Chinese drug/class name                                  */
/*   URL        - quoted search URL of the child page for this code            */
/* Defaults fetch the ATC database front page into WORK.ATC_1.                 */
%macro getfirst(url=%nrstr("http://cheman.chemnet.com/atc/"),outds=%str(atc_1));
/*filename temp_sa1 url &url.;*/
data temp;
length text $20000.;
informat text $20000.;
format text $20000.;
/* dlm='~~~~' never occurs in the HTML, so each input record is one full line; */
/* the URL access method streams the page directly into the data step.         */
infile &url. url DSD missover lrecl=100000000 dlm='~~~~';
input text $;
retain gp ;
order=_n_;
/* tp=1 flags a line that opens a <ul> block */
if text="<ul>" then tp=1;
else tp=0 ;
/* gp is a running count of <ul> tags seen so far, i.e. the id of the block    */
/* the current line belongs to.                                                */
if _n_=1 then gp=tp;
else gp=gp+tp; /* BUG FIX: original read "gp=gp tp;" - a syntax error; "+" restored */
if missing(text) then delete;
run;
quit;
proc sort data=temp out=temp sortseq=linguistic(numeric_collation=on);by gp order ;run;
data temp2;
length text final $20000. ;
set temp;
by gp order;
retain final;
/* Concatenate every line of one <ul> block into a single string FINAL.        */
/* NOTE(review): CATS strips the ' ' argument, so lines are joined WITHOUT a   */
/* separator - CATX(' ',final,text) may have been intended; confirm against    */
/* the actual page markup before changing.                                     */
if first.gp then final=text;
else final=cats(' ',final,text);
/* Output once per completed block; gp>3 skips the page's navigation lists.    */
if compress(text)='</ul>' and gp>3 then output;
run;
data &outds.;
set temp2;
/* Strip the known HTML tags and attribute fragments to isolate the payload.   */
finals=tranwrd(final,'<ul>','');
finals=tranwrd(finals,'<li>','');
finals=tranwrd(finals,'</li>','');
finals=tranwrd(finals,'<a href="search.cgi?terms=','');
finals=tranwrd(finals,'&c=atc_number" class="blue fb fftt">','');
finals=compbl(tranwrd(finals,'</a',''));
/* First blank-delimited token of the cleaned string is the ATC code.          */
ATC_CODE=strip(scan(finals,1,' '));
/*ATC_CODE=substr(final,find(final,'<a href="search.cgi?terms='),find(final,'&c=atc_number')-find(final,'<a href="search.cgi?terms='));*/
/*ATC_CODE=strip(tranwrd(ATC_CODE,'<a href="search.cgi?terms=',''));*/
/* English name sits between the class attribute and the closing </a> tag.     */
ATC_EN_NAM=substr(final,find(final,'class="blue fb fftt">'),find(final,'</a>')-find(final,'class="blue fb fftt">'));
ATC_EN_NAM=strip(tranwrd(ATC_EN_NAM,'class="blue fb fftt">',''));
/* Chinese name is the text after the second '>' and before the next '<'.      */
ATC_CN_NAM=strip(scan(finals,2,'>'));
ATC_CN_NAM=strip(scan(ATC_CN_NAM,1,'<'));
/* Build the (quoted) child-page URL so it can be fed back into %getfirst.     */
url=strip('"http://cheman.chemnet.com/atc/')||strip('search.cgi?terms=')||strip(ATC_CODE)||strip('&c=atc_number"');
keep url ATC_CODE ATC_CN_NAM ATC_EN_NAM;
run;
quit;
proc delete data=temp temp2 ;quit;
%mend;
/* WARNING: deletes EVERY dataset in the WORK library before the crawl starts. */
proc datasets library=work kill nolist;
quit;
/**/
/* Default-parameter call: fetches the ATC front page into WORK.ATC_1.         */
%getfirst;
%getfirst直接调用,就是获取ATC首页的信息,因为宏中设置了宏参数的默认值...
第一层级获取了,还要获取各个层级,看网页的代码,其实很容易发现大层级跳转到子层级的规律,这个时候就可以基于规律生成一个URL,然后再通过循环一层一层地获取...
代码语言:javascript复制*2基于第一步获取的ATC首页信息以及内容后 进行逐条循环获取各类下的数据;
/* getson: for every URL in dataset &INDS, generate a %getfirst call via       */
/* CALL EXECUTE (one child dataset T&seq._<n> per row), then stack the child   */
/* datasets into ATC_&SEQ and apply the filter statement passed in &COD.       */
/* Parameters:                                                                 */
/*   inds - input dataset holding a URL column (output of the previous level)  */
/*   seq  - level number used to name the output dataset ATC_&SEQ              */
/*   cod  - a complete SAS statement (e.g. a WHERE) injected into the stacking */
/*          data step to keep only codes of this level                         */
%macro getson(inds=,seq=,cod=);
/* Count observations via the OPEN/ATTRN/CLOSE functions; skip empty inputs.   */
%let dsid=%sysfunc(open(&inds.));
%let heve=%sysfunc(attrn(&dsid,nobs));
%let rc= %sysfunc(close(&dsid));
%put 此层级共抓取URL:&heve;
%if &heve.>0 %then %do;
data _null_;
set &inds. end=last;
/* if _n_=1 then */
/* NOTE(review): %getfirst is not wrapped in %nrstr, so the macro resolves     */
/* while this data step executes rather than after it - it works here because  */
/* %getfirst only pushes generated code, but confirm before reusing.           */
call execute(strip('%getfirst(url=%nrstr(')||strip(url)||strip('),outds=T&seq._')||strip(_N_)||strip(')'));
/* Remember the row count for the T&seq._1 - T&seq._&nobs dataset range below. */
/* (numeric _N_ is implicitly converted to character inside STRIP)             */
if last then call symput('nobs',strip(_N_));
run;
quit;
/* Stack all per-URL result datasets and apply the caller-supplied filter.     */
data atc_&seq.;
set T&seq._1-T&seq._&nobs.;
&cod.
run;
quit;
proc delete data=T&seq._1-T&seq._&nobs.;quit;
%end;
%put **********************************************************************************;
%put Log:已经获取第 &seq. 层级数据集;
%put ***********************************************************************************;
%mend;
/* Crawl ATC levels 2-5: each call feeds the previous level's URLs back in.    */
/* The WHERE filter keeps only codes whose length matches that level           */
/* (3, 4, 5 and 7 characters respectively).                                    */
%getson(inds=atc_1,seq=2,cod=%str(where length(ATC_CODE)=3;))
%getson(inds=atc_2,seq=3,cod=%str(where length(ATC_CODE)=4;))
%getson(inds=atc_3,seq=4,cod=%str(where length(ATC_CODE)=5;))
%getson(inds=atc_4,seq=5,cod=%str(where length(ATC_CODE)=7;))
如上代码,其实也就很容易得到ATC1-ATC5层级的数据,基于数据再经过merge等操作的整合,便能生成一个线下的编码词典库。我想,应该可以比较方便编码人员进行ATC的编码。接下来看看最终生成的ATC的Excel是啥样的。(代码写得很粗糙,有很多细节可以优化,不过呀,小编也就运行一次,目的达成了,就懒得再优化了)
基本就是这样了...