The script below reads one URL per line from standard input, queries icp.alexa.cn for each one, and writes the ICP registration fields to out.txt as tab-separated columns.

#coding=gbk
import os
import sys
import re
import time
import urllib2

def perror_and_exit(message, status=-1):
    sys.stderr.write(message + '\n')
    sys.exit(status)

def get_text_from_html_tag(html):
    # extract the text between ">" and "</" of the first tag in the fragment
    pattern_text = re.compile(r">.*?</", re.DOTALL)
    return pattern_text.findall(html)[0][1:-2].strip()

def parse_alexa(url):
    url_alexa = "http://icp.alexa.cn/index.php?q=%s" % url
    print url_alexa
    # handle exceptions: retry with exponential backoff, the number of retries is capped
    times = 0
    while times < 5000:
        try:
            alexa = urllib2.urlopen(url_alexa).read()
            pattern_table = re.compile(r"<table.*?</table>", re.DOTALL | re.MULTILINE)
            match_table = pattern_table.search(alexa)
            if not match_table:
                raise BaseException("No table in HTML")
            break
        except:
            print "try %s times: sleep %s seconds" % (times, 2 ** times)
            times += 1
            time.sleep(2 ** times)
            continue

    table = match_table.group()
    pattern_tr = re.compile(r"<tr.*?</tr>", re.DOTALL | re.MULTILINE)
    match_tr = pattern_tr.findall(table)
    if len(match_tr) != 2:
        perror_and_exit("table format is incorrect")

    icp_tr = match_tr[1]
    pattern_td = re.compile(r"<td.*?</td>", re.DOTALL | re.MULTILINE)
    match_td = pattern_td.findall(icp_tr)

    #print match_td
    company_name = get_text_from_html_tag(match_td[1])
    company_properties = get_text_from_html_tag(match_td[2])
    company_icp = get_text_from_html_tag(match_td[3])
    company_icp = company_icp[company_icp.find(">") + 1:]
    company_website_name = get_text_from_html_tag(match_td[4])
    company_website_home_page = get_text_from_html_tag(match_td[5])
    company_website_home_page = company_website_home_page[company_website_home_page.rfind(">") + 1:]
    company_detail_url = get_text_from_html_tag(match_td[7])
    pattern_href = re.compile(r"href=\".*?\"", re.DOTALL | re.MULTILINE)
    match_href = pattern_href.findall(company_detail_url)
    if len(match_href) == 0:
        company_detail_url = ""
    else:
        company_detail_url = match_href[0][len("href=\""):-1]
    return [url, company_name, company_properties, company_icp,
            company_website_name, company_website_home_page, company_detail_url]

if __name__ == "__main__":
    fw = open("out.txt", "w")
    for url in sys.stdin:
        fw.write("\t".join(parse_alexa(url.strip())) + "\n")
        time.sleep(2)   # pause 2 seconds after every fetch (see the note below)
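For reference, a usage sketch (not part of the original post) that drives parse_alexa directly instead of piping URLs through stdin. It assumes the script above is saved as icp_alexa.py; the module name and the domains are placeholders only.

# Hypothetical usage, assuming the script above is saved as icp_alexa.py.
from icp_alexa import parse_alexa

for domain in ["example.com", "example.org"]:   # placeholder domains
    # parse_alexa returns [url, company_name, company_properties, company_icp,
    #                      company_website_name, company_website_home_page,
    #                      company_detail_url]
    print "\t".join(parse_alexa(domain))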
Every fetch sleeps 2 seconds to keep the IP from being banned; in practice, even with the sleep, the IP still gets banned after a while.
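If the goal is simply to space requests out, the fixed delay can be pulled into a small helper so the wait also accounts for time already spent parsing. This is only a sketch; the Throttle class name and the 2-second default are illustrative, not from the original script.

# Sketch of a fixed-interval throttle: wait() guarantees at least `delay`
# seconds between consecutive fetches, no matter how long parsing took.
import time

class Throttle(object):
    def __init__(self, delay=2.0):
        self.delay = delay
        self.last = 0.0

    def wait(self):
        elapsed = time.time() - self.last
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed)
        self.last = time.time()

# usage: throttle = Throttle(2.0), then call throttle.wait() before each urlopen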
Because the extraction is structured (fields are pulled from fixed positions in the page), the program stops working as soon as the site changes its page layout.
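A cheap guard against silent breakage is to check that the result row still has the expected number of cells before indexing into it, so a layout change fails loudly instead of producing garbage output. The sketch below reuses the same <td> regex as the script; the helper name and the 8-cell expectation (mirroring the match_td[1]..match_td[7] accesses above) are illustrative assumptions, not part of the original program.

# Sketch: validate the expected table structure before trusting fixed indices.
import re
import sys

TD_PATTERN = re.compile(r"<td.*?</td>", re.DOTALL)

def extract_cells(row_html, expected=8):
    cells = TD_PATTERN.findall(row_html)
    if len(cells) < expected:
        sys.stderr.write("page layout changed: expected %d cells, got %d\n"
                         % (expected, len(cells)))
        return None
    return cells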