700字范文,内容丰富有趣,生活中的好帮手!
700字范文 > Java版PageRank及网站收录情况查询代码

Java版PageRank及网站收录情况查询代码

时间:2020-10-26 11:41:57

相关推荐

Java版PageRank及网站收录情况查询代码

在Google这个由10的100次方得名的站点中,各种评估网站的算法层出不穷,而PageRank即是其中之一。

Google的PageRank根据网站的外部链接和内部链接的数量和质量来衡量网站的价值。PageRank背后的概念是,每个指向页面的链接都是对该页面的一次投票,被链接得越多,就意味着被其他网站投票越多。这个就是所谓的“链接流行度”——衡量多少人愿意将他们的网站和你的网站挂钩。PageRank这个概念引自学术界对论文被引述频度的评价——即被别人引述的次数越多,一般判断这篇论文的权威性就越高。

通常情况下,原创内容越多的站点,PageRank越容易提升,反之则相对比较困难,PageRank最大上限值为10。在Google的评估中,能上10的网站真可谓凤毛麟角,即使算上Google,能成就PageRank 10这“伟业”者,放眼全球也不足40家。一般来说,个人站点评估值达到4即算办得不错,商业网站到6以上便算步入正轨了。

网上虽然有不少现成的查询器及源码,但是光用别人的毕竟不符合程序员风格,所以今天自己用Java重造轮子又写了个PageRank查询实现,捎带着把一些常用搜索引擎的网站链接及反向链接查询也加上了。

源码如下:

GooglePageRank.java

packageorg.loon.test;importjava.io.IOException;importjava.util.Random;importjava.util.regex.Matcher;importjava.util.regex.Pattern;/***Copyright**LicensedundertheApacheLicense,Version2.0(the"License");youmaynot*usethisfileexceptincompliancewiththeLicense.Youmayobtainacopyof*theLicenseat**/licenses/LICENSE-2.0**Unlessrequiredbyapplicablelaworagreedtoinwriting,software*distributedundertheLicenseisdistributedonan"ASIS"BASIS,WITHOUT*WARRANTIESORCONDITIONSOFANYKIND,eitherexpressorimplied.Seethe*Licenseforthespecificlanguagegoverningpermissionsandlimitationsunder*theLicense.**@projectloonframework*@authorchenpeng*@email:ceponline@*@version0.1*/publicclassGooglePageRank{//googlepagerank服务器ip地址列表(最近google小气了很多,反复查询一个封ip)finalstaticString[]GoogleServiceIP=newString[]{"64.233.161.100","64.233.161.101","64.233.183.91","64.233.189.44","66.102.1.103","66.102.9.115","66.249.89.83","66.249.91.99","66.249.93.190"};//google用识别标记finalstaticprivateintGOOGLE_MAGIC=0xE6359A60;//ch数值混合器privateclassCHMix{inta;intb;intc;publicCHMix(){this(0,0,0);}publicCHMix(inta,intb,intc){this.a=a;this.b=b;this.c=c;}}/***按google要求混合成ch数据**@parammix*/privatestaticvoidmix(finalCHMixmix){mix.a-=mix.b;mix.a-=mix.c;mix.a^=mix.c>>13;mix.b-=mix.c;mix.b-=mix.a;mix.b^=mix.a<<8;mix.c-=mix.a;mix.c-=mix.b;mix.c^=mix.b>>13;mix.a-=mix.b;mix.a-=mix.c;mix.a^=mix.c>>12;mix.b-=mix.c;mix.b-=mix.a;mix.b^=mix.a<<16;mix.c-=mix.a;mix.c-=mix.b;mix.c^=mix.b>>5;mix.a-=mix.b;mix.a-=mix.c;mix.a^=mix.c>>3;mix.b-=mix.c;mix.b-=mix.a;mix.b^=mix.a<<10;mix.c-=mix.a;mix.c-=mix.b;mix.c^=mix.b>>15;}/***获得ch数值混合器**@return*/publicstaticCHMixgetInnerCHMix(){returnnewGooglePageRank().newCHMix();}/***通过url获得googlech(google数据库针对页面的全球唯一标识)**@paramurl*@return*/publicstaticStringGoogleCH(finalStringurl){//格式化为google要求的info:url模式StringnUrl=String.format("info:%s",newObject[]{url});//获得新url字符串格式char[]urls=nUrl.toCharArray();//获得新url长度intlength=urls.length;//获得一个ch数值混合器CHMixchMix=GooglePageRank.getInnerCHMix();//为c注入google识别标识chMix.c=GOOGLE_MAGIC;//为a、b
项注入google要求的初始标识chMix.a=chMix.b=0x9E3779B9;intk=0;intlen=length;while(len>=12){chMix.a+=(int)(urls[k+0]+(urls[k+1]<<8)+(urls[k+2]<<16)+(urls[k+3]<<24));chMix.b+=(int)(urls[k+4]+(urls[k+5]<<8)+(urls[k+6]<<16)+(urls[k+7]<<24));chMix.c+=(int)(urls[k+8]+(urls[k+9]<<8)+(urls[k+10]<<16)+(urls[k+11]<<24));//获得混合运算后的数据GooglePageRank.mix(chMix);k+=12;len-=12;}chMix.c+=length;//产生googlech的11位标识switch(len){case11:chMix.c+=(int)(urls[k+10]<<24);case10:chMix.c+=(int)(urls[k+9]<<16);case9:chMix.c+=(int)(urls[k+8]<<8);case8:chMix.b+=(int)(urls[k+7]<<24);case7:chMix.b+=(int)(urls[k+6]<<16);case6:chMix.b+=(int)(urls[k+5]<<8);case5:chMix.b+=(int)(urls[k+4]);case4:chMix.a+=(int)(urls[k+3]<<24);case3:chMix.a+=(int)(urls[k+2]<<16);case2:chMix.a+=(int)(urls[k+1]<<8);case1:chMix.a+=(int)(urls[k+0]);break;default:break;}//获得混合运算后的数据GooglePageRank.mix(chMix);//获得未修订的CHStringtch=String.valueOf(chMix.c);//矫正差值后反馈正确CHreturnString.format("6%s",newObject[]{tch.length()<10?("-"+tch).intern():tch});}/***正则匹配pagerank结果**@paramvalue*@return*/privatestaticStringMatchRank(finalStringvalue){Patternpattern=pile("Rank_1:[0-9]:([0-9]+)");Matchermatcher=pattern.matcher(value);if(matcher.find()){returnmatcher.group(1);}return"0";}/***获得指定页面的googlepagerank值**@paramurl*@return*/publicstaticStringGooglePR(finalStringurl){Stringrip=GoogleServiceIP[newRandom().nextInt(GoogleServiceIP.length)];returnGooglePR(url,rip);}/***以指定的google服务器获得指定页面的googlepagerank值**@paramurl*@paramip*@return*/publicstaticStringGooglePR(finalStringurl,finalStringip){//产生查询用唯一标识Stringchecksum=GoogleCH(url);//产生查询用urlStringqueryUrl=String.format("http://%s/search?client=navclient-auto&ch=%s&features=Rank&q=info:%s",newObject[]{ip,checksum,url});Stringresponse;try{response=SimpleWebClient.getRequestHttp(queryUrl);}catch(IOExceptione){response="";}if(response.length()==0){return"0";}else{returnGooglePageRank.MatchRank(response);}}}

SimpleWebClient.java

package org.loon.test;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;

/**
 * Copyright
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 * Minimal blocking HTTP client built on {@link HttpURLConnection}.
 *
 * @project loonframework
 * @author chenpeng
 * @email:ceponline@
 * @version 0.1
 */
public class SimpleWebClient {

    /** Upper bound on redirects followed for one request. */
    private static final int MAX_REDIRECTS = 10;

    /** Chunk size used when draining a response body. */
    private static final int CHUNK_SIZE = 4096;

    /** Cap on the initial buffer so a bogus Content-Length cannot exhaust memory. */
    private static final int MAX_INITIAL_BUFFER = 1 << 20;

    /**
     * Sends a GET request and returns the body decoded as UTF-8.
     *
     * @param urlString target address
     * @return response body
     * @throws IOException if the request fails
     */
    public static String getRequestHttp(String urlString) throws IOException {
        return getRequestHttp(urlString, "utf-8");
    }

    /**
     * Sends a GET request and returns the body decoded with the given charset.
     *
     * @param urlString target address
     * @param encoding  charset name for decoding the response
     * @return response body
     * @throws IOException if the request fails
     */
    public static String getRequestHttp(String urlString, String encoding)
            throws IOException {
        return getRequestHttp(urlString, encoding, null, 5000);
    }

    /**
     * Sends a request and returns the response body.
     *
     * Entries of {@code parameter} with the keys {@code user}, {@code pass},
     * {@code method} and {@code post} select HTTP Basic credentials, the
     * request method and the request body; every other entry is sent as a
     * request header. Keys and values must be strings.
     *
     * Redirects (301, 302, 303, 307, 308) are followed manually, at most
     * {@value #MAX_REDIRECTS} times. For any status other than 200/201 the
     * error body is returned, or "ERROR" when the server sent none.
     *
     * @param urlString target address; "http://" is prepended when it has no
     *                  http/https scheme
     * @param encoding  charset name for decoding the response, or {@code null}
     *                  for the platform default
     * @param parameter optional request options and headers, may be
     *                  {@code null}
     * @param timeout   connect and read timeout in milliseconds; values
     *                  &lt;= 0 leave the defaults untouched
     * @return response body
     * @throws IOException on network failure, a redirect without a Location
     *                     header, or too many redirects
     */
    public static String getRequestHttp(final String urlString,
            final String encoding, final Map<?, ?> parameter, final int timeout)
            throws IOException {
        final String nURL = (urlString.startsWith("http://") || urlString
                .startsWith("https://")) ? urlString : "http://" + urlString;

        String user = null;
        String password = null;
        String method = "GET";
        String post = null;
        final Map<String, String> headers = new HashMap<String, String>();
        if (parameter != null) {
            for (Entry<?, ?> option : parameter.entrySet()) {
                final String key = (String) option.getKey();
                final String value = (String) option.getValue();
                if ("user".equals(key)) {
                    user = value;
                } else if ("pass".equals(key)) {
                    password = value;
                } else if ("method".equals(key)) {
                    method = value;
                } else if ("post".equals(key)) {
                    post = value;
                } else {
                    headers.put(key, value);
                }
            }
        }

        String authorization = null;
        if (user != null && password != null) {
            // The scheme name must be separated from the credentials by a
            // space, and the encoded value must not contain line breaks.
            authorization = "Basic "
                    + Base64.getEncoder().encodeToString(
                            (user + ":" + password)
                                    .getBytes(StandardCharsets.UTF_8));
        }

        URL url = new URL(nURL);
        for (int hop = 0; hop <= MAX_REDIRECTS; hop++) {
            final HttpURLConnection connection = (HttpURLConnection) url
                    .openConnection();
            try {
                if (authorization != null) {
                    connection.setRequestProperty("Authorization",
                            authorization);
                }
                connection.setDoOutput(post != null);
                connection.setDoInput(true);
                connection.setUseCaches(false);
                // Redirects are handled below so the loop can cap them.
                connection.setInstanceFollowRedirects(false);
                connection.setRequestMethod(method);
                if (timeout > 0) {
                    connection.setConnectTimeout(timeout);
                    // Without a read timeout a stalled server blocks forever.
                    connection.setReadTimeout(timeout);
                }
                // Browser-like default headers.
                connection.setRequestProperty("User-Agent",
                        "Mozilla/4.0 (compatible; MSIE 7.0; )");
                connection.setRequestProperty(
                        "Accept",
                        "image/gif,image/x-xbitmap,image/jpeg,image/pjpeg,application/x-shockwave-flash,application/msword,application/vnd.ms-excel,application/vnd.ms-powerpoint,*/*");
                // Caller-supplied headers override the defaults above.
                for (Entry<String, String> header : headers.entrySet()) {
                    connection.setRequestProperty(header.getKey(),
                            header.getValue());
                }
                if (post != null) {
                    // The body is written in the platform default charset, as
                    // before.
                    try (OutputStreamWriter body = new OutputStreamWriter(
                            connection.getOutputStream())) {
                        body.write(post);
                    }
                }

                final int responseCode = connection.getResponseCode();
                if (isRedirect(responseCode)) {
                    final String location = connection
                            .getHeaderField("Location");
                    if (location == null) {
                        throw new IOException("Redirect (" + responseCode
                                + ") without Location header from " + url);
                    }
                    // Resolve against the current URL so relative targets work.
                    url = new URL(url, location);
                    continue;
                }

                final InputStream raw = (responseCode == 200 || responseCode == 201) ? connection
                        .getInputStream() : connection.getErrorStream();
                if (raw == null) {
                    // The server sent an error status without a body.
                    return "ERROR";
                }
                return read(new BufferedInputStream(raw),
                        connection.getContentLength(), encoding);
            } finally {
                connection.disconnect();
            }
        }
        throw new IOException("More than " + MAX_REDIRECTS
                + " redirects while requesting " + nURL);
    }

    /** Tells whether the status code asks the client to go elsewhere. */
    private static boolean isRedirect(final int status) {
        return status == HttpURLConnection.HTTP_MOVED_PERM
                || status == HttpURLConnection.HTTP_MOVED_TEMP
                || status == HttpURLConnection.HTTP_SEE_OTHER
                || status == 307 || status == 308;
    }

    /**
     * Drains a stream into a string and closes it.
     *
     * @param in       stream to read; closed before returning
     * @param size     expected byte count, used only as a capacity hint
     * @param encoding charset name, or {@code null} for the platform default
     * @return decoded content
     * @throws IOException if reading fails or the charset is unsupported
     */
    private static String read(final InputStream in, final int size,
            final String encoding) throws IOException {
        final int capacity = Math.max(1, Math.min(size, MAX_INITIAL_BUFFER));
        try (InputStream source = in;
                ByteArrayOutputStream sink = new ByteArrayOutputStream(
                        capacity)) {
            final byte[] chunk = new byte[CHUNK_SIZE];
            int count;
            while ((count = source.read(chunk)) != -1) {
                sink.write(chunk, 0, count);
            }
            return encoding != null ? sink.toString(encoding) : sink
                    .toString();
        }
    }
}

WebAppraise.java

packageorg.loon.test;importjava.io.IOException;/***Copyright**LicensedundertheApacheLicense,Version2.0(the"License");youmaynot*usethisfileexceptincompliancewiththeLicense.Youmayobtainacopyof*theLicenseat**/licenses/LICENSE-2.0**Unlessrequiredbyapplicablelaworagreedtoinwriting,software*distributedundertheLicenseisdistributedonan"ASIS"BASIS,WITHOUT*WARRANTIESORCONDITIONSOFANYKIND,eitherexpressorimplied.Seethe*Licenseforthespecificlanguagegoverningpermissionsandlimitationsunder*theLicense.**@projectloonframework*@authorchenpeng*@email:ceponline@*@version0.1*/publicclassWebAppraise{privateStringgoogleSum;privateStringbaiduSum;privateStringmsnSum;privateStringaltaVistaSum;privateStringallTheWebSum;privateStringyahooSum;privateStringtestURL;publicWebAppraise(finalStringurl){if(url!=null&&!"".equals(url)){this.testURL=url.trim();if(this.testURL.startsWith("http://")){this.testURL=this.testURL.substring(7);}if(this.testURL.startsWith("https://")){this.testURL=this.testURL.substring(8);}}else{thrownewRuntimeException("urlisNULL!");}}/***分析指定链接结果,并返回整型数值**@paramsearchURL*@paramanchor*@paramtrail*@return*/privatestaticintgetLinks(finalStringsearchURL,finalStringanchor,finalStringtrail){intcount=0;StringserverResponse;try{//我国特色……if(searchURL.startsWith("")){//永不离休的gb2312同志(-_-||)serverResponse=SimpleWebClient.getRequestHttp(searchURL,"gb2312");}else{serverResponse=SimpleWebClient.getRequestHttp(searchURL);}}catch(IOExceptione){serverResponse=e.getMessage();}intpos=serverResponse.indexOf(anchor);if(pos>1){serverResponse=serverResponse.substring(pos+anchor.length());pos=serverResponse.indexOf(trail);Stringvalue=serverResponse.substring(0,pos).trim();value=value.replace(",","");value=value.replace(".","");count=Integer.parseInt(value);}returncount;}publicStringgetAllTheWebSite(){returngetAllTheWebSite(false);}publicStringgetAllTheWebSite(booleanisDomain){try{StringallTheWeb;if(isDomain){allTheWeb="/search?cat=web&cs=utf8&rys=0&itag=crv&_sb_lang=any&q=linkdomain%3A"+this.testURL;}
else{allTheWeb="/search?cat=web&cs=utf-8&q=link%3Ahttp%3A%2F%2F"+this.testURL+"&_sb_lang=any";}allTheWebSum=""+getLinks(allTheWeb,"<spanclass=/"ofSoMany/">","</span>");}catch(Exceptionex){allTheWebSum=ex.getMessage();}returnallTheWebSum;}publicStringgetAltaVistaSite(){returngetAltaVistaSite(false);}publicStringgetAltaVistaSite(booleanisDomain){try{StringaltaVista;if(isDomain){altaVista="/web/results?itag=ody&q=link%3A"+this.testURL+"&kgs=0&kls=0";}else{altaVista="/web/results?itag=ody&kgs=0&kls=0&q=site%3A"+this.testURL;}altaVistaSum=""+getLinks(altaVista,"AltaVistafound","");}catch(Exceptionex){altaVistaSum=ex.getMessage();}returnaltaVistaSum;}publicStringgetGooglePR(){returnGooglePageRank.GooglePR(this.testURL);}publicStringgetGoogleSite(){returngetGoogleSite(false);}publicStringgetGoogleSite(finalbooleanisDomian){try{Stringgoogle;//反向链接if(isDomian){google="/search?hl=en&q=link%3A"+this.testURL;}else{google="/search?hl=en&q=site%3A"+this.testURL+"&btnG=Google+Search&aq=f&oq=";}googleSum=""+getLinks(google,"about<b>","</b>");}catch(Exceptionex){googleSum=ex.getMessage();}returngoogleSum;}publicStringgetBaiduSite(){returngetBaiduSite(false);}publicStringgetBaiduSite(finalbooleanisDomian){try{Stringbaidu;if(isDomian){baidu="/s?wd=domain%3A"+this.testURL+"&cl=3";}else{baidu="/s?wd=site%3A"+this.testURL;}baiduSum=""+getLinks(baidu,"找到相关网页","篇");}catch(Exceptionex){Stringbaidu;if(isDomian){baidu="/s?wd=domain%3A"+this.testURL+"&cl=3";}else{baidu="/s?wd=site%3A"+this.testURL;}baiduSum=""+getLinks(baidu,"找到相关网页约","篇");}returnbaiduSum;}publicStringgetYahooSite(){returngetYahooSite(false);}publicStringgetYahooSite(finalbooleanisDomian){try{Stringyahoo;if(isDomian){yahoo="http://sitemap./search?p="+this.testURL+"&bwm=i";yahooSum=""+getLinks(yahoo,"<strong>","</strong>");}else{yahoo="/s?p=site%3A"+this.testURL+"&pid=hp&v=web";yahooSum=""+getLinks(yahoo,"找到相关网页约","条");}}catch(Exceptionex){yahooSum=ex.getMessage();}returnyahooSum;}publicStringgetMsnSite(){returngetMsnSite(false
);}publicStringgetMsnSite(booleanisDomain){try{Stringmsn;if(isDomain){msn="http://cnweb./results.aspx?q=link%3A"+this.testURL+"&mkt=zh-cn&scope=&FORM=LIVSO";}else{msn="http://cnweb./results.aspx?q=site%3A"+this.testURL+"&go=&form=QBRE";}msnSum=""+getLinks(msn,"共","条搜索结果");}catch(Exceptionex){msnSum=ex.getMessage();}returnmsnSum;}publicStringgetTestURL(){returntestURL;}}

Test.java

packageorg.loon.test;/***Copyright**LicensedundertheApacheLicense,Version2.0(the"License");youmaynot*usethisfileexceptincompliancewiththeLicense.Youmayobtainacopyof*theLicenseat**/licenses/LICENSE-2.0**Unlessrequiredbyapplicablelaworagreedtoinwriting,software*distributedundertheLicenseisdistributedonan"ASIS"BASIS,WITHOUT*WARRANTIESORCONDITIONSOFANYKIND,eitherexpressorimplied.Seethe*Licenseforthespecificlanguagegoverningpermissionsandlimitationsunder*theLicense.**@projectloonframework*@authorchenpeng*@email:ceponline@*@version0.1*/publicclassTest{publicstaticvoidmain(String[]args){WebAppraiseappraise=newWebAppraise("/cping1982");System.out.println("GooglePagerRank值:"+appraise.getGooglePR());System.out.println("google收录:"+appraise.getGoogleSite());System.out.println("google反向收录:"+appraise.getGoogleSite(true));System.out.println("yahoo收录:"+appraise.getYahooSite());System.out.println("yahoo反向收录:"+appraise.getYahooSite(true));System.out.println("baidu收录:"+appraise.getBaiduSite());System.out.println("baidu反向收录:"+appraise.getBaiduSite(true));System.out.println("msn收录:"+appraise.getMsnSite());System.out.println("msn反向收录:"+appraise.getMsnSite(true));System.out.println("AllTheWeb收录:"+appraise.getAllTheWebSite());System.out.println("AllTheWeb反向收录:"+appraise.getAllTheWebSite(true));System.out.println("AltaVista收录:"+appraise.getAltaVistaSite());System.out.println("AltaVista反向收录:"+appraise.getAltaVistaSite(true));}}

检测/cping1982运行结果如下图:

源码下载地址:/source/929348

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。