In [None]:
# Display as slides with the Jupyter notebook RISE extension
# https://github.com/damianavila/RISE
from notebook.services.config import ConfigManager
cm = ConfigManager()
# Store the slideshow options under the 'livereveal' config section
cm.update(
    'livereveal',
    dict(
        theme='sans-serif',
        transition='default',
        start_slideshow_at='selected',
    ),
)

# Intro to Web Scraping


&nbsp;

### Matt Bauman
#### July 6, 2016

# What is HTML?

* Human *and* machine-readable text
* Supposed to be the semantic structure of a document

* Horribly abused
* Often terribly malformed
* Frequently unreadable by humans and just barely readable by machines

* It's a ~~miracle~~ ton of effort that makes browsers work at all

# Okay, but *what is it*?

* Plain-text markup that wraps content in **tags**
* Tags are marked in brackets like `<body>`
* And everything that follows is considered part of `body` until it's closed with a `</body>`.

* Tags can be nested
* Some tags enclose no content and can be closed immediately, like `<br />`.
* Can have attributes to modify their behavior or name them

In [None]:
# `display` and `HTML` belong to the public IPython.display API; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

# A <p> tag with a `style` attribute, closed by its matching </p>
display(HTML('<p style="color:red;">Hello, world</p>'))

In [None]:
import requests
#print(requests.get('http://www.nytimes.com/').text)

# Important tags for scraping

* `div` - major sections
* `table` - broken down into `tr` (rows) and `td` (datum)
* `form` - contains `input` tags that get submitted
* `ul`/`ol` - lists (unordered and ordered), contains `li` (list items)

# Important attributes for scraping

* `id` and `class`
* They *name* tags; web developers use these names for styling and interactivity
* `id`s are unique; `class`es are groups

# Why web scraping is terrible

### Invalid pages and incompatibilities

* The W3C (World Wide Web Consortium) sets standards for HTML, CSS, XML, etc.
* They have [a validator](https://validator.w3.org) to ensure that pages meet their specs

### HTML can be extremely hard to read

* Fortunately, web inspector tools can make your life easier
* Check out [The NY Times](http://www.nytimes.com/) in the browser

### Some sites require javascript to work

* There aren't any libraries (that I'm aware of) that implement Javascript
* Try turning off Javascript in your browser and make sure the site still works
* You can often *emulate* the Javascript code to make the same requests... but it's a pain

### It's fragile 

* While the *markup* is machine readable, that just specifies page layout
* The *same content* can be coded in HTML in an infinite number of ways and still look identical
* Web authors can change their code at any point...

* **and still look very similar**. [An extreme example](https://web.archive.org/web/20001109144000/http://www1.nytimes.com/)

# Working around the terrible-ness

* Don't worry about parsing yourself -- no regexes or string searches!
* Don't worry about traversing individual nested levels (e.g., inside two divs and ...)

### Instead...

* Think of each webpage as a "tag soup"
* Try to find a way to describe the tags you're looking for in a minimal way
* And use a good library

# Scraping in five lines:

In [4]:
# Look for headlines in the NYTimes
import requests
from bs4 import BeautifulSoup
r = requests.get('http://www.nytimes.com/')
soup = BeautifulSoup(r.text, 'html.parser')  # explicit parser: no warning, same parse tree on every machine
tags = soup.find_all(attrs={'class': 'story-heading'})


In [5]:
# Render every matched tag as live HTML in the notebook output
for tag in tags:
    display(HTML(str(tag)))

# Hedging your bets

* There are lots of ways to specify a search through the tag soup
* Some methods may be more robust than others...
* But it's not worth spending too much time trying to out-wit whatever might be updating the site on the other side

In [6]:
# Another way to get the headlines
import re

articles = soup.find_all('article')
# Raw string: in a plain literal '\d' is an invalid escape sequence and
# triggers a warning on modern Python. The compiled pattern is unchanged.
[article.find_all(re.compile(r'^h\d')) for article in articles]

[[<h2 class="css-km70tz esl82me0">Listen to ‘The Daily’</h2>],
 [<h2 class="css-km70tz esl82me0">The Daily Mini Crossword</h2>],
 [<h2 class="css-km70tz esl82me0">Got a confidential news tip?</h2>],
 [<h2 class="css-1qwxefa esl82me0"><span>In China, Some Fear the End of ‘Chimerica’</span></h2>],
 [<h2 class="css-n2blzn esl82me0">Why the U.S.-China Trade War Could Be Long and Painful</h2>],
 [<h2 class="css-1qwxefa esl82me0"><span>White House Reviews Military Plans Against Iran, in Echoes of Iraq War</span></h2>],
 [<h2 class="css-1qwxefa esl82me0"><span>How House Democrats in Key Districts Plan to Keep Their Seats</span></h2>],
 [<h2 class="css-n2blzn esl82me0">Elizabeth Warren refused to participate in a town hall on Fox News, which she called a “a hate-for-profit racket.”</h2>],
 [<h2 class="css-n2blzn esl82me0">Beto O’Rourke tried to reset his flagging campaign, saying on “The View” that he regretted his “born to be in it” comment.</h2>],
 [<h2 class="css-1qwxefa esl82me0"><span>Orb

# Advanced topics: HTTP

* HTTP specifies *how* you ask for and retrieve content
* Also specifies metadata in headers that control caching, redirects, sessions, and more

In [7]:
# Fetch Google's homepage, then show the headers that came back with the response
r = requests.get(url='http://google.com/')
r.headers

{'Date': 'Tue, 14 May 2019 19:14:48 GMT', 'Expires': '-1', 'Cache-Control': 'private, max-age=0', 'Content-Type': 'text/html; charset=ISO-8859-1', 'P3P': 'CP="This is not a P3P policy! See g.co/p3phelp for more info."', 'Content-Encoding': 'gzip', 'Server': 'gws', 'Content-Length': '4900', 'X-XSS-Protection': '0', 'X-Frame-Options': 'SAMEORIGIN', 'Set-Cookie': '1P_JAR=2019-05-14-19; expires=Thu, 13-Jun-2019 19:14:48 GMT; path=/; domain=.google.com, NID=183=FUUJhrZssgRPnIV2AIR2bX1hnftGj3H4O_97-UaZLCwakWFN0geeMv8dUCz0adnmV-V1_Lg058gyxApxOhTe9RSBs7S3L2K2FpVGy4p3kPYndj8CU-GYoEHwPHF1SZZPRrfJBcL1GMbd0H4J-ChraOM_8ha4mkaUeLvxzgLKcbg; expires=Wed, 13-Nov-2019 19:14:48 GMT; path=/; domain=.google.com; HttpOnly'}

# Searches and forms

* Typically, the most interesting things to scrape are hidden behind searches and forms
* How do you enter text into Google's search box via Python?

In [9]:
# Download Google's homepage and pretty-print the first <form> element on it.
# The parser is named explicitly so the result does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(requests.get('http://google.com').text, 'html.parser')
print(soup.find('form').prettify())

<form action="/search" name="f">
 <table cellpadding="0" cellspacing="0">
  <tr valign="top">
   <td width="25%">
   </td>
   <td align="center" nowrap="">
    <input name="ie" type="hidden" value="ISO-8859-1"/>
    <input name="hl" type="hidden" value="en"/>
    <input name="source" type="hidden" value="hp"/>
    <input name="biw" type="hidden"/>
    <input name="bih" type="hidden"/>
    <div class="ds" style="height:32px;margin:4px 0">
     <input autocomplete="off" class="lst" maxlength="2048" name="q" size="57" style="color:#000;margin:0;padding:5px 8px 0 6px;vertical-align:top" title="Google Search" value=""/>
    </div>
    <br style="line-height:0"/>
    <span class="ds">
     <span class="lsbb">
      <input class="lsb" name="btnG" type="submit" value="Google Search"/>
     </span>
    </span>
    <span class="ds">
     <span class="lsbb">
      <input class="lsb" name="btnI" onclick="if(this.form.q.value)this.checked=1; else top.location='/doodles/'" type="submit" value="I'm F

In [10]:
# Submit the search form: each key is the `name` of one of the form's inputs
r = requests.get(
    'http://google.com/search',
    params={'q': 'how long does a walrus live?', 'btnI': "I'm Feeling Lucky"},
)


# Types of requests

* `requests.get` is actually doing a `GET`
    * It encodes the parameters (if any) directly into the url: `?param=value&param2=value2...`
    * This means that it gets *saved into your browser history*
    * Back buttons, refresh may send the same parameters again

### Other HTTP verbs:

* `POST` is the other most common method
    * Just like `GET`, except that it sends its parameters in the request body instead of the URL
    * Often used for purchases, posts, etc, that you don't want to submit twice
* There are [others](https://en.wikipedia.org/wiki/Hypertext_Transfer_Protocol#Request_methods) (`PUT`, `DELETE`, `HEAD`, ...), but they're rarer

# A slightly more complicated example

* Let's look for satellites! [heavens-above.com](http://heavens-above.com)

In [11]:
# Scrape the times that the ISS is visible
r = requests.get('http://heavens-above.com/PassSummary.aspx?satid=25544&lat=41.8781&lng=-87.6298&loc=Chicago&alt=181&tz=CST')
def scrape_times(text):
    """Extract pass times from the HTML of a pass-summary page.

    Parameters
    ----------
    text : str
        HTML source of the page.

    Returns
    -------
    list of str
        One entry per ``<tr class="clickableRow">`` row: the text of the
        row's first cell, a space, and the text of its third cell
        (e.g. ``'14 May 02:19:24'``). Rows with fewer than three cells
        are skipped.
    """
    # Name the parser so the parse tree is the same on every machine
    soup = BeautifulSoup(text, 'html.parser')
    times = []
    for row in soup.find_all('tr', attrs={'class': 'clickableRow'}):
        cols = row.find_all('td')
        if len(cols) < 3:
            # Not a complete data row; indexing cols[2] would raise IndexError
            continue
        times.append(cols[0].text + ' ' + cols[2].text)
    return times
scrape_times(r.text)

['14 May 02:19:24',
 '14 May 03:55:48',
 '15 May 01:29:57',
 '15 May 03:04:56',
 '15 May 04:41:09',
 '16 May 00:40:20',
 '16 May 02:13:58',
 '16 May 03:50:22',
 '17 May 01:23:02',
 '17 May 02:59:33',
 '17 May 04:35:50',
 '17 May 21:16:59',
 '17 May 22:53:16',
 '18 May 00:31:29',
 '18 May 02:08:43',
 '18 May 03:44:55',
 '18 May 22:02:06',
 '18 May 23:40:02',
 '19 May 01:17:47',
 '19 May 02:54:03',
 '19 May 04:30:48',
 '19 May 21:11:07',
 '19 May 22:48:33',
 '20 May 00:26:41',
 '20 May 02:03:13',
 '20 May 03:39:36',
 '20 May 21:57:08',
 '20 May 23:35:24',
 '21 May 01:12:21',
 '21 May 21:05:50',
 '21 May 22:43:57',
 '22 May 00:21:25',
 '22 May 21:52:26',
 '22 May 23:30:23',
 '23 May 01:06:45',
 '23 May 21:00:56',
 '23 May 22:39:09']

In [18]:
# Get the next page
r = requests.get('http://heavens-above.com/PassSummary.aspx?satid=25544&lat=41.8781&lng=-87.6298&loc=Chicago&alt=181&tz=CST')
def get_next_page(r):
    """Request the page that follows ``r`` by re-submitting its form via POST.

    Collects the ``name``/``value`` pairs of every ``<input>`` on the page,
    removes the ``ctl00$cph1$btnPrev`` entry, sets ``ctl00_cph1_radioAll``,
    and POSTs the result to the form's ``action`` URL (resolved relative to
    ``r.url``).

    Parameters
    ----------
    r : requests.Response
        Response holding the current page.

    Returns
    -------
    requests.Response
        The response to the POST request.
    """
    from urllib.parse import urljoin

    soup = BeautifulSoup(r.text, 'html.parser')
    # Inputs without a `name` are skipped, and a missing `value` becomes ''
    # instead of raising KeyError.
    d = {field.attrs['name']: field.attrs.get('value', '')
         for field in soup.find_all('input')
         if 'name' in field.attrs}
    # Presumably dropped so the submission is not treated as a click on the
    # "previous" button -- confirm against the site's form. Tolerate its absence.
    d.pop('ctl00$cph1$btnPrev', None)
    # NOTE(review): this key uses '_' separators while the button name above
    # uses '$' -- confirm it matches the input's `name` rather than its `id`.
    d['ctl00_cph1_radioAll'] = 'radioVisible'
    url = urljoin(r.url, soup.find('form').attrs['action'])
    return requests.post(url, data=d)
scrape_times(get_next_page(r).text)

['24 May 00:15:52',
 '24 May 01:52:08',
 '24 May 18:33:13',
 '24 May 20:09:31',
 '24 May 21:47:44',
 '24 May 23:24:56',
 '25 May 01:01:07',
 '25 May 02:38:38',
 '25 May 17:43:09',
 '25 May 19:18:15',
 '25 May 20:56:11',
 '25 May 22:33:55',
 '26 May 00:10:11',
 '26 May 01:46:56',
 '26 May 18:27:11',
 '26 May 20:04:38',
 '26 May 21:42:44',
 '26 May 23:19:16',
 '27 May 00:55:38',
 '27 May 17:36:24',
 '27 May 19:13:07',
 '27 May 20:51:23',
 '27 May 22:28:20',
 '28 May 00:04:32',
 '28 May 01:42:53',
 '28 May 16:45:58',
 '28 May 18:21:44',
 '28 May 19:59:51',
 '28 May 21:37:19',
 '28 May 23:13:31',
 '29 May 00:50:34',
 '29 May 15:56:24',
 '29 May 17:30:32',
 '29 May 19:08:15',
 '29 May 20:46:11',
 '29 May 22:22:33',
 '29 May 23:59:05',
 '30 May 16:39:33',
 '30 May 18:16:39',
 '30 May 19:54:52',
 '30 May 21:31:36',
 '30 May 23:07:52',
 '31 May 15:48:53',
 '31 May 17:25:10',
 '31 May 19:03:22',
 '31 May 20:40:35',
 '31 May 22:16:46',
 '31 May 23:54:15',
 '01 Jun 14:58:43',
 '01 Jun 16:33:49',


In [21]:
# Walk through ten consecutive result pages, collecting every pass time
from tqdm import tqdm
r = requests.get('http://heavens-above.com/PassSummary.aspx?satid=25544&lat=41.8781&lng=-87.6298&loc=Chicago&alt=181&tz=CST')

times = []
for i in tqdm(range(10)):
    times += scrape_times(r.text)  # rows from the page currently in hand
    r = get_next_page(r)           # then advance to the following page
times

100%|██████████| 10/10 [00:12<00:00,  1.22s/it]


['14 May 02:19:24',
 '14 May 03:55:48',
 '15 May 01:29:57',
 '15 May 03:04:56',
 '15 May 04:41:09',
 '16 May 00:40:20',
 '16 May 02:13:58',
 '16 May 03:50:22',
 '17 May 01:23:02',
 '17 May 02:59:33',
 '17 May 04:35:50',
 '17 May 21:16:59',
 '17 May 22:53:16',
 '18 May 00:31:29',
 '18 May 02:08:43',
 '18 May 03:44:55',
 '18 May 22:02:06',
 '18 May 23:40:02',
 '19 May 01:17:47',
 '19 May 02:54:03',
 '19 May 04:30:48',
 '19 May 21:11:07',
 '19 May 22:48:33',
 '20 May 00:26:41',
 '20 May 02:03:13',
 '20 May 03:39:36',
 '20 May 21:57:08',
 '20 May 23:35:24',
 '21 May 01:12:21',
 '21 May 21:05:50',
 '21 May 22:43:57',
 '22 May 00:21:25',
 '22 May 21:52:26',
 '22 May 23:30:23',
 '23 May 01:06:45',
 '23 May 21:00:56',
 '23 May 22:39:09',
 '24 May 00:15:52',
 '24 May 01:52:08',
 '24 May 18:33:13',
 '24 May 20:09:31',
 '24 May 21:47:44',
 '24 May 23:24:56',
 '25 May 01:01:07',
 '25 May 02:38:38',
 '25 May 17:43:09',
 '25 May 19:18:15',
 '25 May 20:56:11',
 '25 May 22:33:55',
 '26 May 00:10:11',
