scraperwiki - How to scrape more than the first instance of a triple-nested list of links in Python?


I am trying to determine the simplest way to record the contents of webpages linked from webpages linked from an original webpage. I want my output to be a table with rows corresponding to the contents of the third layer of pages.

As you can see from the code, I am only able to get the first instance of the desired item on each third-level page. Also, while my current code returns one row corresponding to each h2 item on the base URL, I hope to have multiple rows per h2 item (as many as there are instances of "span.'case-doc-details' a" on the second-layer page).
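To make the goal concrete, here is a small stand-alone illustration of the one-row-per-match behaviour I'm after (the HTML and selector are just stand-ins, not the real site):

import lxml.html

# stand-in page with two links under the same span
html = '<div><span class="x"><a href="/a">one</a><a href="/b">two</a></span></div>'
root = lxml.html.fromstring(html)
links = root.cssselect("span.x a")
for link in links:
    # read from the loop variable, not links[0], so every match produces output
    print link.attrib.get('href'), link.text_content()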

Some additional info: at each linking stage, I do not know how many pages will be linked. I am using Python and ScraperWiki, and I am new to both. I have attempted to research this question, but have hit a roadblock in my knowledge of what to ask. Thanks in advance for your help.

import scraperwiki
import urlparse
import lxml.html
import urllib

def scrape_table(root):
    rows = root.cssselect("h2")
    record = {}
    counter = 0
    for row in rows:
        table_cells = row.cssselect("h2 a")
        for cell in table_cells:
            record['count'] = counter
            table_cellsurls = table_cells[0].cssselect("a")
            record['caseurl'] = table_cellsurls[0].attrib.get('href')
            caselinkurl = urllib.urlopen('http://www.italaw.com/' + table_cellsurls[0].attrib.get('href')).read()
            #print caselinkurl
            caseroots = lxml.html.fromstring(caselinkurl)
            title = caseroots.cssselect("title")
            record['title'] = title[0].text_content()
            ids = caseroots.cssselect("div div div div a")
            for i in ids:
                if len(ids) <= 2:
                    record['rules'] = "none"
                    record['treaty'] = "none"
                else:
                    record['rules'] = ids[2].text_content()
                    record['treaty'] = ids[3].text_content()
            pars = caseroots.cssselect("span.'case-doc-details' a")
            #print "pars length is", len(pars)
            caselinkurl2 = urllib.urlopen('http://www.italaw.com/' + pars[0].attrib.get('href')).read()
            caseroots2 = lxml.html.fromstring(caselinkurl2)
            # create table rows, marked off by the case they came from
            for par in pars:
                if len(pars) == 0:
                    record['detailsurl'] = "none"
                else:
                    record['detailsurl'] = pars[0].attrib.get('href')
                pars2 = caseroots2.cssselect("div.'field-item even' span.'date-display-single'")
                if len(pars2) == 0:
                    record['doc date'] = "none"
                else:
                    record['doc date'] = pars2[0].text_content()
                pars3 = caseroots2.cssselect("div.'field-name-field-case-doc-file' span.'file' a")
                if len(pars3) == 0:
                    record['doc type link'] = "none"
                    record['doc type'] = "none"
                else:
                    record['doc type link'] = pars3[0].attrib.get('href')
                    record['doc type'] = pars3[0].text_content()
                pars4 = caseroots2.cssselect("div.'field-name-field-arbitrator-claimant'")
                if len(pars4) == 0:
                    record['claimant nominee'] = "none"
                else:
                    record['claimant nominee'] = pars4[0].text_content()
                pars5 = caseroots2.cssselect("div.'field-name-field-arbitrator-respondent'")
                if len(pars5) == 0:
                    record['respondent nominee'] = "none"
                else:
                    record['respondent nominee'] = pars5[0].text_content()
                pars6 = caseroots2.cssselect("div.'field-name-field-arbitrator-chair'")
                if len(pars6) == 0:
                    record['president'] = "none"
                else:
                    record['president'] = pars6[0].text_content()
            print record, '------------'
            scraperwiki.sqlite.save(['count'], record)
            counter += 1

def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)

# start here:
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=all'
scrape_and_look_for_next_link(url)

Here's the code I've got so far - it doesn't yet grab the documents link data (or save anything), but that should be a case of extending the principles here into another function:

import scraperwiki
import urlparse
import lxml.html
import urllib

def scrape_page(linkurl):
    html = scraperwiki.scrape(linkurl)
    root = lxml.html.fromstring(html)
    title = root.cssselect("h1")
    print "the title:", title[0].text
    record = {}
    record['title'] = title[0].text
    record['url'] = linkurl
    #<div class="field-items"><div class="field-item even"><a
    arbrules = root.cssselect("div.field-items a")
    if arbrules:
        record['arbruleurl'] = arbrules[0].attrib.get("href")
        record['arbrule'] = arbrules[0].text_content()
    else:
        record['arbruleurl'] = "no url"
        record['arbrule'] = "no arbrule"
    legalbasis = root.cssselect("div.field-label")
    if legalbasis:
        record['legalbasis'] = legalbasis[0].text_content()
    else:
        record['legalbasis'] = "no legal basis given"
    extralinks = []
    contents = root.cssselect("div.view-content a")
    if contents:
        for content in contents:
            extralinks.append(content.text_content())
            extralinks.append(content.attrib.get("href"))
        record['extralinks'] = extralinks
    else:
        record['extralinks'] = "no links"
    #record['firstparty'] = title[0].text.split(" v. ")[0]
    #record['secondparty'] = title[0].text.split(" v. ")[1]
    #record['casenumber'] = title[0].text.split(" case no.")[1]
    print record

def scrape_table(root):
    links = root.cssselect("div.link-wrapper a")
    for link in links:
        print link.text_content()
        linkurl = link.attrib.get("href")
        print linkurl
        scrape_page('http://www.italaw.com' + linkurl)

def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)

# start here:
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=all'
scrape_and_look_for_next_link(url)
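For instance, the missing document-grabbing piece might look roughly like this. This is an untested sketch: scrape_doc_page and its parameters are invented here for illustration, and the selectors are only guesses adapted from the question's code.

import scraperwiki
import lxml.html

def scrape_doc_page(docurl, casetitle):
    # fetch one third-level documents page and save one row for it
    html = scraperwiki.scrape(docurl)
    root = lxml.html.fromstring(html)
    record = {}
    record['case'] = casetitle
    record['url'] = docurl
    # selectors below are assumptions adapted from the question's code
    dates = root.cssselect("span.date-display-single")
    record['doc date'] = dates[0].text_content() if dates else "none"
    files = root.cssselect("span.file a")
    record['doc link'] = files[0].attrib.get("href") if files else "none"
    # key on the document url so each document gets its own row
    scraperwiki.sqlite.save(['url'], record)

You would call it from scrape_page, once per link collected in the contents loop - e.g. scrape_doc_page('http://www.italaw.com' + content.attrib.get("href"), record['title']) - which gives you one saved row per document rather than one per case.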
