EVOLUTION-MANAGER
Edit File: rvest.html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Web scraping 101</title>
<script>
// Pandoc 2.9 adds attributes on both header and div. We remove the former (to
// be compatible with the behavior of Pandoc < 2.8).
//
// NOTE: each `//` comment in this script must be terminated by a real line
// break. If the script is collapsed onto one physical line, the first `//`
// comments out everything after it and the handler is never registered.
document.addEventListener('DOMContentLoaded', function(e) {
  // First child of every pandoc section wrapper (div.section.levelN).
  var hs = document.querySelectorAll("div.section[class*='level'] > :first-child");
  var i, h, a;
  for (i = 0; i < hs.length; i++) {
    h = hs[i];
    if (!/^h[1-6]$/i.test(h.tagName)) continue;  // it should be a header h1-h6
    // `attributes` is a live collection: removing index 0 repeatedly empties it.
    a = h.attributes;
    while (a.length > 0) h.removeAttribute(a[0].name);
  }
});
</script>
<style type="text/css"> code{white-space: pre-wrap;} span.smallcaps{font-variant: small-caps;} span.underline{text-decoration: underline;} div.column{display: inline-block; vertical-align: top; width: 50%;} div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;} ul.task-list{list-style: none;} </style>
<style type="text/css"> code { white-space: pre; } .sourceCode { overflow: visible; } </style>
<style type="text/css" data-origin="pandoc"> pre > code.sourceCode { white-space: pre; position: relative; } pre > code.sourceCode > span { display: inline-block; line-height: 1.25; } pre > code.sourceCode > span:empty { height: 1.2em; } .sourceCode { overflow: visible; } code.sourceCode > span { color: inherit; text-decoration: inherit; } div.sourceCode { margin: 1em 0; } pre.sourceCode { margin: 0; } @media screen { div.sourceCode { overflow: auto; } } @media print { pre > code.sourceCode { white-space: pre-wrap; } pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; } } pre.numberSource code { counter-reset: source-line 0; } pre.numberSource code > span { position: relative; left: -4em; counter-increment: source-line; } pre.numberSource code > span > a:first-child::before { content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline; border: none; display: inline-block; -webkit-touch-callout: none; -webkit-user-select: none; -khtml-user-select: none; -moz-user-select: none; -ms-user-select: none; user-select: none; padding: 0 4px; width: 4em; color: #aaaaaa; } pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; } div.sourceCode { } @media screen { pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; } } code span.al { color: #ff0000; font-weight: bold; } /* Alert */ code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */ code span.at { color: #7d9029; } /* Attribute */ code span.bn { color: #40a070; } /* BaseN */ code span.bu { } /* BuiltIn */ code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */ code span.ch { color: #4070a0; } /* Char */ code span.cn { color: #880000; } /* Constant */ code span.co { color: #60a0b0; font-style: italic; } /* Comment */ code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */ code span.do { color: #ba2121; font-style: italic; } /* Documentation */ code span.dt { color: #902000; } /* DataType */ code span.dv { color: #40a070; } /* DecVal */ code span.er { color: #ff0000; font-weight: bold; } /* Error */ code span.ex { } /* Extension */ code span.fl { color: #40a070; } /* Float */ code span.fu { color: #06287e; } /* Function */ code span.im { } /* Import */ code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */ code span.kw { color: #007020; font-weight: bold; } /* Keyword */ code span.op { color: #666666; } /* Operator */ code span.ot { color: #007020; } /* Other */ code span.pp { color: #bc7a00; } /* Preprocessor */ code span.sc { color: #4070a0; } /* SpecialChar */ code span.ss { color: #bb6688; } /* SpecialString */ code span.st { color: #4070a0; } /* String */ code span.va { color: #19177c; } /* Variable 
*/ code span.vs { color: #4070a0; } /* VerbatimString */ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
</style>
<script>
// apply pandoc div.sourceCode style to pre.sourceCode instead
//
// NOTE: each `//` comment in this script must be terminated by a real line
// break. On a single physical line the first `//` comments out the whole
// function, so the style rewrite below never runs.
(function() {
  var sheets = document.styleSheets;
  for (var i = 0; i < sheets.length; i++) {
    // Only touch the stylesheet tagged data-origin="pandoc".
    if (sheets[i].ownerNode.dataset["origin"] !== "pandoc") continue;
    // Reading cssRules can throw; skip the sheet when it does.
    try { var rules = sheets[i].cssRules; } catch (e) { continue; }
    var j = 0;
    while (j < rules.length) {
      var rule = rules[j];
      // check if there is a div.sourceCode rule
      if (rule.type !== rule.STYLE_RULE || rule.selectorText !== "div.sourceCode") {
        j++;
        continue;
      }
      var style = rule.style.cssText;
      // check if color or background-color is set
      if (rule.style.color === '' && rule.style.backgroundColor === '') {
        j++;
        continue;
      }
      // replace div.sourceCode by a pre.sourceCode rule
      // (j is not advanced here: the next pass sees the inserted
      // pre.sourceCode rule at index j and steps over it)
      sheets[i].deleteRule(j);
      sheets[i].insertRule('pre.sourceCode{' + style + '}', j);
    }
  }
})();
</script>
<style type="text/css">body { background-color: #fff; margin: 1em auto; max-width: 700px; overflow: visible; padding-left: 2em; padding-right: 2em; font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif; font-size: 14px; line-height: 1.35; } #TOC { clear: both; margin: 0 0 10px 10px; padding: 4px; width: 400px; border: 1px solid #CCCCCC; border-radius: 5px; background-color: #f6f6f6; font-size: 13px; line-height: 1.3; } #TOC .toctitle { font-weight: bold; font-size: 15px; margin-left: 5px; } #TOC ul { padding-left: 40px; margin-left: -1.5em; margin-top: 5px; margin-bottom: 5px; } #TOC ul ul { margin-left: -2em; } #TOC li { line-height: 16px; } table { margin: 1em auto; border-width: 1px; border-color: #DDDDDD; border-style: outset; border-collapse: collapse; } table th { border-width: 2px; padding: 5px; border-style: inset; } table td { border-width: 1px; border-style: inset; line-height: 18px; padding: 5px 5px; } table, table th, table td { border-left-style: none; border-right-style: none; } table thead,
table tr.even { background-color: #f7f7f7; } p { margin: 0.5em 0; } blockquote { background-color: #f6f6f6; padding: 0.25em 0.75em; } hr { border-style: solid; border: none; border-top: 1px solid #777; margin: 28px 0; } dl { margin-left: 0; } dl dd { margin-bottom: 13px; margin-left: 13px; } dl dt { font-weight: bold; } ul { margin-top: 0; } ul li { list-style: circle outside; } ul ul { margin-bottom: 0; } pre, code { background-color: #f7f7f7; border-radius: 3px; color: #333; white-space: pre-wrap; } pre { border-radius: 3px; margin: 5px 0px 10px 0px; padding: 10px; } pre:not([class]) { background-color: #f7f7f7; } code { font-family: Consolas, Monaco, 'Courier New', monospace; font-size: 85%; } p > code, li > code { padding: 2px 0px; } div.figure { text-align: center; } img { background-color: #FFFFFF; padding: 2px; border: 1px solid #DDDDDD; border-radius: 3px; border: 1px solid #CCCCCC; margin: 0 5px; } h1 { margin-top: 0; font-size: 35px; line-height: 40px; } h2 { border-bottom: 4px solid #f7f7f7; padding-top: 10px; padding-bottom: 2px; font-size: 145%; } h3 { border-bottom: 2px solid #f7f7f7; padding-top: 10px; font-size: 120%; } h4 { border-bottom: 1px solid #f7f7f7; margin-left: 8px; font-size: 105%; } h5, h6 { border-bottom: 1px solid #ccc; font-size: 105%; } a { color: #0033dd; text-decoration: none; } a:hover { color: #6666ff; } a:visited { color: #800080; } a:visited:hover { color: #BB00BB; } a[href^="http:"] { text-decoration: underline; } a[href^="https:"] { text-decoration: underline; } code > span.kw { color: #555; font-weight: bold; } code > span.dt { color: #902000; } code > span.dv { color: #40a070; } code > span.bn { color: #d14; } code > span.fl { color: #d14; } code > span.ch { color: #d14; } code > span.st { color: #d14; } code > span.co { color: #888888; font-style: italic; } code > span.ot { color: #007020; } code > span.al { color: #ff0000; font-weight: bold; } code > span.fu { color: #900; font-weight: bold; } code > span.er { color: 
#a61717; background-color: #e3d2d2; } </style> </head> <body> <h1 class="title toc-ignore">Web scraping 101</h1> <p>This vignette introduces you to the basics of web scraping with rvest. You’ll first learn the basics of HTML and how to use CSS selectors to refer to specific elements, then you’ll learn how to use rvest functions to get data out of HTML and into R.</p> <div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(rvest)</span></code></pre></div> <div id="html-basics" class="section level2"> <h2>HTML basics</h2> <p>HTML stands for “HyperText Markup Language” and looks like this:</p> <div class="sourceCode" id="cb2"><pre class="sourceCode html"><code class="sourceCode html"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="kw"><html></span></span> <span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a><span class="kw"><head></span></span> <span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a> <span class="kw"><title></span>Page title<span class="kw"></title></span></span> <span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a><span class="kw"></head></span></span> <span id="cb2-5"><a href="#cb2-5" aria-hidden="true" tabindex="-1"></a><span class="kw"><body></span></span> <span id="cb2-6"><a href="#cb2-6" aria-hidden="true" tabindex="-1"></a> <span class="kw"><h1</span> <span class="er">id</span><span class="ot">=</span><span class="st">'first'</span><span class="kw">></span>A heading<span class="kw"></h1></span></span> <span id="cb2-7"><a href="#cb2-7" aria-hidden="true" tabindex="-1"></a> <span class="kw"><p></span>Some text <span class="dv">&amp;</span> <span class="kw"><b></span>some bold text.<span class="kw"></b></p></span></span> <span id="cb2-8"><a href="#cb2-8" aria-hidden="true" tabindex="-1"></a> <span class="kw"><img</span> <span 
class="er">src</span><span class="ot">=</span><span class="st">'myimg.png'</span> <span class="er">width</span><span class="ot">=</span><span class="st">'100'</span> <span class="er">height</span><span class="ot">=</span><span class="st">'100'</span><span class="kw">&gt;</span></span> <span id="cb2-9"><a href="#cb2-9" aria-hidden="true" tabindex="-1"></a><span class="kw">&lt;/body&gt;</span></span></code></pre></div> <p>HTML has a hierarchical structure formed by <strong>elements</strong> which consist of a start tag (e.g. <code>&lt;tag&gt;</code>), optional <strong>attributes</strong> (<code>id='first'</code>), an end tag<a href="#fn1" class="footnote-ref" id="fnref1"><sup>1</sup></a> (like <code>&lt;/tag&gt;</code>), and <strong>contents</strong> (everything in between the start and end tag).</p> <p>Since <code>&lt;</code> and <code>&gt;</code> are used for start and end tags, you can’t write them directly. Instead you have to use the HTML <strong>escapes</strong> <code>&amp;gt;</code> (greater than) and <code>&amp;lt;</code> (less than). And since those escapes use <code>&amp;</code>, if you want a literal ampersand you have to escape it as <code>&amp;amp;</code>. There is a wide range of possible HTML escapes but you don’t need to worry about them too much because rvest automatically handles them for you.</p> <div id="elements" class="section level3"> <h3>Elements</h3> <p>All up, there are over 100 HTML elements.
Some of the most important are:</p> <ul> <li><p>Every HTML page must be in an <code>&lt;html&gt;</code> element, and it must have two children: <code>&lt;head&gt;</code>, which contains document metadata like the page title, and <code>&lt;body&gt;</code>, which contains the content you see in the browser.</p></li> <li><p>Block tags like <code>&lt;h1&gt;</code> (heading 1), <code>&lt;p&gt;</code> (paragraph), and <code>&lt;ol&gt;</code> (ordered list) form the overall structure of the page.</p></li> <li><p>Inline tags like <code>&lt;b&gt;</code> (bold), <code>&lt;i&gt;</code> (italics), and <code>&lt;a&gt;</code> (links) format text inside block tags.</p></li> </ul> <p>If you encounter a tag that you’ve never seen before, you can find out what it does with a little googling. I recommend the <a href="https://developer.mozilla.org/en-US/docs/Web/HTML">MDN Web Docs</a> which are produced by Mozilla, the company that makes the Firefox web browser.</p> </div> <div id="contents" class="section level3"> <h3>Contents</h3> <p>Most elements can have content in between their start and end tags. This content can either be text or more elements. For example, the following HTML contains a paragraph of text, with one word in bold.</p> <p> Hi! My <b>name</b> is Hadley. </p> <p>The <strong>children</strong> of a node refers only to elements, so the <code>&lt;p&gt;</code> element above has one child, the <code>&lt;b&gt;</code> element. The <code>&lt;b&gt;</code> element has no children, but it does have contents (the text “name”).</p> <p>Some elements, like <code>&lt;img&gt;</code> can’t have children. These elements depend solely on attributes for their behavior.</p> </div> <div id="attributes" class="section level3"> <h3>Attributes</h3> <p>Tags can have named <strong>attributes</strong> which look like <code>name1='value1' name2='value2'</code>. Two of the most important attributes are <code>id</code> and <code>class</code>, which are used in conjunction with CSS (Cascading Style Sheets) to control the visual appearance of the page.
These are often useful when scraping data off a page.</p> </div> </div> <div id="reading-html-with-rvest" class="section level2"> <h2>Reading HTML with rvest</h2> <p>You’ll usually start the scraping process with <code>read_html()</code>. This returns a <code>xml_document</code><a href="#fn2" class="footnote-ref" id="fnref2"><sup>2</sup></a> object which you’ll then manipulate using rvest functions:</p> <div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a>html <span class="ot"><-</span> <span class="fu">read_html</span>(<span class="st">"http://rvest.tidyverse.org/"</span>)</span> <span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="fu">class</span>(html)</span> <span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "xml_document" "xml_node"</span></span></code></pre></div> <p>For examples and experimentation, rvest also includes a function that lets you create an <code>xml_document</code> from literal HTML:</p> <div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a>html <span class="ot"><-</span> <span class="fu">minimal_html</span>(<span class="st">"</span></span> <span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a><span class="st"> <p>This is a paragraph<p></span></span> <span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a><span class="st"> <ul></span></span> <span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a><span class="st"> <li>This is a bulleted list</li></span></span> <span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a><span class="st"> </ul></span></span> <span id="cb4-6"><a href="#cb4-6" aria-hidden="true" tabindex="-1"></a><span class="st">"</span>)</span> <span id="cb4-7"><a href="#cb4-7" aria-hidden="true" 
tabindex="-1"></a>html</span> <span id="cb4-8"><a href="#cb4-8" aria-hidden="true" tabindex="-1"></a><span class="co">#> {html_document}</span></span> <span id="cb4-9"><a href="#cb4-9" aria-hidden="true" tabindex="-1"></a><span class="co">#> <html></span></span> <span id="cb4-10"><a href="#cb4-10" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...</span></span> <span id="cb4-11"><a href="#cb4-11" aria-hidden="true" tabindex="-1"></a><span class="co">#> [2] <body>\n<p>This is a paragraph</p>\n<p>\n </p>\n<ul>\n<li>This is a bull ...</span></span></code></pre></div> <p>Regardless of how you get the HTML, you’ll need some way to identify the elements that contain the data you care about. rvest provides two options: CSS selectors and XPath expressions. Here I’ll focus on CSS selectors because they’re simpler but still sufficiently powerful for most scraping tasks.</p> </div> <div id="css-selectors" class="section level2"> <h2>CSS selectors</h2> <p>CSS is short for cascading style sheets, and is a tool for defining the visual styling of HTML documents. CSS includes a miniature language for selecting elements on a page called <strong>CSS selectors</strong>. CSS selectors define patterns for locating HTML elements, and are useful for scraping because they provide a concise way of describing which elements you want to extract.</p> <p>CSS selectors can be quite complex, but fortunately you only need the simplest for rvest, because you can also write R code for more complicated situations. 
The four most important selectors are:</p> <ul> <li><p><code>p</code>: selects all <code>&lt;p&gt;</code> elements.</p></li> <li><p><code>.title</code>: selects all elements with <code>class</code> “title”.</p></li> <li><p><code>p.special</code>: selects all <code>&lt;p&gt;</code> elements with <code>class</code> “special”.</p></li> <li><p><code>#title</code>: selects the element with the <code>id</code> attribute that equals “title”. Id attributes must be unique within a document, so this will only ever select a single element.</p></li> </ul> <p>If you want to learn more CSS selectors I recommend starting with the fun <a href="https://flukeout.github.io/">CSS Diner</a> tutorial and then referring to the <a href="https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Selectors">MDN web docs</a>.</p> <p>Let’s try out the most important selectors with a simple example:</p> <div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a>html <span class="ot">&lt;-</span> <span class="fu">minimal_html</span>(<span class="st">"</span></span> <span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a><span class="st"> &lt;h1&gt;This is a heading&lt;/h1&gt;</span></span> <span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a><span class="st"> &lt;p id='first'&gt;This is a paragraph&lt;/p&gt;</span></span> <span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a><span class="st"> &lt;p class='important'&gt;This is an important paragraph&lt;/p&gt;</span></span> <span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a><span class="st">"</span>)</span></code></pre></div> <p>In rvest you can extract a single element with <code>html_element()</code> or all matching elements with <code>html_elements()</code>.
Both functions take a document<a href="#fn3" class="footnote-ref" id="fnref3"><sup>3</sup></a> and a css selector:</p> <div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a>html <span class="sc">%>%</span> <span class="fu">html_element</span>(<span class="st">"h1"</span>)</span> <span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a><span class="co">#> {html_node}</span></span> <span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a><span class="co">#> <h1></span></span> <span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a>html <span class="sc">%>%</span> <span class="fu">html_elements</span>(<span class="st">"p"</span>)</span> <span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a><span class="co">#> {xml_nodeset (2)}</span></span> <span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] <p id="first">This is a paragraph</p></span></span> <span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a><span class="co">#> [2] <p class="important">This is an important paragraph</p></span></span> <span id="cb6-8"><a href="#cb6-8" aria-hidden="true" tabindex="-1"></a>html <span class="sc">%>%</span> <span class="fu">html_elements</span>(<span class="st">".important"</span>)</span> <span id="cb6-9"><a href="#cb6-9" aria-hidden="true" tabindex="-1"></a><span class="co">#> {xml_nodeset (1)}</span></span> <span id="cb6-10"><a href="#cb6-10" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] <p class="important">This is an important paragraph</p></span></span> <span id="cb6-11"><a href="#cb6-11" aria-hidden="true" tabindex="-1"></a>html <span class="sc">%>%</span> <span class="fu">html_elements</span>(<span class="st">"#first"</span>)</span> <span id="cb6-12"><a href="#cb6-12" aria-hidden="true" tabindex="-1"></a><span class="co">#> {xml_nodeset 
(1)}</span></span> <span id="cb6-13"><a href="#cb6-13" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt; [1] &lt;p id="first"&gt;This is a paragraph&lt;/p&gt;</span></span></code></pre></div> <p>Selectors can also be combined in various ways using <strong>combinators</strong>. The most important combinator is “ ” (a single space), the <strong>descendant</strong> combinator. For example, <code>p a</code> selects all <code>&lt;a&gt;</code> elements that are descendants of a <code>&lt;p&gt;</code> element.</p> <p>If you don’t know exactly what selector you need, I highly recommend using <a href="https://rvest.tidyverse.org/articles/selectorgadget.html">SelectorGadget</a>, which lets you automatically generate the selector you need by supplying positive and negative examples in the browser.</p> </div> <div id="extracting-data" class="section level2"> <h2>Extracting data</h2> <p>Now that you’ve got the elements you care about, you’ll need to get data out of them. You’ll usually get the data from either the text contents or an attribute.
But, sometimes (if you’re lucky!), the data you need will be in an HTML table.</p> <div id="text" class="section level3"> <h3>Text</h3> <p>Use <code>html_text2()</code> to extract the plain text contents of an HTML element:</p> <div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a>html <span class="ot"><-</span> <span class="fu">minimal_html</span>(<span class="st">"</span></span> <span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a><span class="st"> <ol></span></span> <span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a><span class="st"> <li>apple &amp; pear</li></span></span> <span id="cb7-4"><a href="#cb7-4" aria-hidden="true" tabindex="-1"></a><span class="st"> <li>banana</li></span></span> <span id="cb7-5"><a href="#cb7-5" aria-hidden="true" tabindex="-1"></a><span class="st"> <li>pineapple</li></span></span> <span id="cb7-6"><a href="#cb7-6" aria-hidden="true" tabindex="-1"></a><span class="st"> </ol></span></span> <span id="cb7-7"><a href="#cb7-7" aria-hidden="true" tabindex="-1"></a><span class="st">"</span>)</span> <span id="cb7-8"><a href="#cb7-8" aria-hidden="true" tabindex="-1"></a>html <span class="sc">%>%</span> </span> <span id="cb7-9"><a href="#cb7-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">html_elements</span>(<span class="st">"li"</span>) <span class="sc">%>%</span> </span> <span id="cb7-10"><a href="#cb7-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">html_text2</span>()</span> <span id="cb7-11"><a href="#cb7-11" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "apple & pear" "banana" "pineapple"</span></span></code></pre></div> <p>Note that the escaped ampersand is automatically converted to <code>&</code>; you’ll only ever see HTML escapes in the source HTML, not in the data returned by rvest.</p> <p>You might wonder why I used <code>html_text2()</code>, since it seems 
to give the same result as <code>html_text()</code>:</p> <div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a>html <span class="sc">%>%</span> </span> <span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">html_elements</span>(<span class="st">"li"</span>) <span class="sc">%>%</span> </span> <span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">html_text</span>()</span> <span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "apple & pear" "banana" "pineapple"</span></span></code></pre></div> <p>The main difference is how the two functions handle white space. In HTML, white space is largely ignored, and it’s the structure of the elements that defines how text is laid out. <code>html_text2()</code> does its best to follow the same rules, giving you something similar to what you’d see in the browser. 
Take this example which contains a bunch of white space that HTML ignores.</p> <div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a>html <span class="ot"><-</span> <span class="fu">minimal_html</span>(<span class="st">"<body></span></span> <span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a><span class="st"> <p></span></span> <span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a><span class="st"> This is</span></span> <span id="cb9-4"><a href="#cb9-4" aria-hidden="true" tabindex="-1"></a><span class="st"> a</span></span> <span id="cb9-5"><a href="#cb9-5" aria-hidden="true" tabindex="-1"></a><span class="st"> paragraph.</p><p>This is another paragraph.</span></span> <span id="cb9-6"><a href="#cb9-6" aria-hidden="true" tabindex="-1"></a><span class="st"> </span></span> <span id="cb9-7"><a href="#cb9-7" aria-hidden="true" tabindex="-1"></a><span class="st"> It has two sentences.</p></span></span> <span id="cb9-8"><a href="#cb9-8" aria-hidden="true" tabindex="-1"></a><span class="st">"</span>)</span></code></pre></div> <p><code>html_text2()</code> gives you what you expect: two paragraphs of text separated by a blank line.</p> <div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a>html <span class="sc">%>%</span> </span> <span id="cb10-2"><a href="#cb10-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">html_element</span>(<span class="st">"body"</span>) <span class="sc">%>%</span> </span> <span id="cb10-3"><a href="#cb10-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">html_text2</span>() <span class="sc">%>%</span> </span> <span id="cb10-4"><a href="#cb10-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">cat</span>()</span> <span id="cb10-5"><a href="#cb10-5" aria-hidden="true" tabindex="-1"></a><span 
class="co">#> This is a paragraph.</span></span> <span id="cb10-6"><a href="#cb10-6" aria-hidden="true" tabindex="-1"></a><span class="co">#> </span></span> <span id="cb10-7"><a href="#cb10-7" aria-hidden="true" tabindex="-1"></a><span class="co">#> This is another paragraph. It has two sentences.</span></span></code></pre></div> <p>Whereas <code>html_text()</code> returns the garbled raw underlying text:</p> <div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a>html <span class="sc">%>%</span> </span> <span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">html_element</span>(<span class="st">"body"</span>) <span class="sc">%>%</span> </span> <span id="cb11-3"><a href="#cb11-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">html_text</span>() <span class="sc">%>%</span> </span> <span id="cb11-4"><a href="#cb11-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">cat</span>()</span> <span id="cb11-5"><a href="#cb11-5" aria-hidden="true" tabindex="-1"></a><span class="co">#> </span></span> <span id="cb11-6"><a href="#cb11-6" aria-hidden="true" tabindex="-1"></a><span class="co">#> </span></span> <span id="cb11-7"><a href="#cb11-7" aria-hidden="true" tabindex="-1"></a><span class="co">#> This is</span></span> <span id="cb11-8"><a href="#cb11-8" aria-hidden="true" tabindex="-1"></a><span class="co">#> a</span></span> <span id="cb11-9"><a href="#cb11-9" aria-hidden="true" tabindex="-1"></a><span class="co">#> paragraph.This is another paragraph.</span></span> <span id="cb11-10"><a href="#cb11-10" aria-hidden="true" tabindex="-1"></a><span class="co">#> </span></span> <span id="cb11-11"><a href="#cb11-11" aria-hidden="true" tabindex="-1"></a><span class="co">#> It has two sentences.</span></span></code></pre></div> </div> <div id="attributes-1" class="section level3"> <h3>Attributes</h3> <p>Attributes are used to 
record the destination of links (the <code>href</code> attribute of <code><a></code> elements) and the source of images (the <code>src</code> attribute of the <code><img></code> element):</p> <div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a>html <span class="ot"><-</span> <span class="fu">minimal_html</span>(<span class="st">"</span></span> <span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a><span class="st"> <p><a href='https://en.wikipedia.org/wiki/Cat'>cats</a></p></span></span> <span id="cb12-3"><a href="#cb12-3" aria-hidden="true" tabindex="-1"></a><span class="st"> <img src='https://cataas.com/cat' width='100' height='200'></span></span> <span id="cb12-4"><a href="#cb12-4" aria-hidden="true" tabindex="-1"></a><span class="st">"</span>)</span></code></pre></div> <p>The value of an attribute can be retrieved with <code>html_attr()</code>:</p> <div class="sourceCode" id="cb13"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a>html <span class="sc">%>%</span> </span> <span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">html_elements</span>(<span class="st">"a"</span>) <span class="sc">%>%</span> </span> <span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">html_attr</span>(<span class="st">"href"</span>)</span> <span id="cb13-4"><a href="#cb13-4" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "https://en.wikipedia.org/wiki/Cat"</span></span> <span id="cb13-5"><a href="#cb13-5" aria-hidden="true" tabindex="-1"></a></span> <span id="cb13-6"><a href="#cb13-6" aria-hidden="true" tabindex="-1"></a>html <span class="sc">%>%</span> </span> <span id="cb13-7"><a href="#cb13-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">html_elements</span>(<span 
class="st">"img"</span>) <span class="sc">%>%</span> </span> <span id="cb13-8"><a href="#cb13-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">html_attr</span>(<span class="st">"src"</span>)</span> <span id="cb13-9"><a href="#cb13-9" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "https://cataas.com/cat"</span></span></code></pre></div> <p>Note that <code>html_attr()</code> always returns a string, so you may need to post-process with <code>as.integer()</code>/<code>readr::parse_integer()</code> or similar.</p> <div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a>html <span class="sc">%>%</span> </span> <span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">html_elements</span>(<span class="st">"img"</span>) <span class="sc">%>%</span> </span> <span id="cb14-3"><a href="#cb14-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">html_attr</span>(<span class="st">"width"</span>)</span> <span id="cb14-4"><a href="#cb14-4" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "100"</span></span> <span id="cb14-5"><a href="#cb14-5" aria-hidden="true" tabindex="-1"></a></span> <span id="cb14-6"><a href="#cb14-6" aria-hidden="true" tabindex="-1"></a>html <span class="sc">%>%</span> </span> <span id="cb14-7"><a href="#cb14-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">html_elements</span>(<span class="st">"img"</span>) <span class="sc">%>%</span> </span> <span id="cb14-8"><a href="#cb14-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">html_attr</span>(<span class="st">"width"</span>) <span class="sc">%>%</span> </span> <span id="cb14-9"><a href="#cb14-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">as.integer</span>()</span> <span id="cb14-10"><a href="#cb14-10" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] 100</span></span></code></pre></div> </div> 
<div id="tables" class="section level3"> <h3>Tables</h3> <p>HTML tables are composed of four main elements: <code><table></code>, <code><tr></code> (table row), <code><th></code> (table heading), and <code><td></code> (table data). Here’s a simple HTML table with two columns and three rows:</p> <div class="sourceCode" id="cb15"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a>html <span class="ot"><-</span> <span class="fu">minimal_html</span>(<span class="st">"</span></span> <span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a><span class="st"> <table></span></span> <span id="cb15-3"><a href="#cb15-3" aria-hidden="true" tabindex="-1"></a><span class="st"> <tr></span></span> <span id="cb15-4"><a href="#cb15-4" aria-hidden="true" tabindex="-1"></a><span class="st"> <th>x</th></span></span> <span id="cb15-5"><a href="#cb15-5" aria-hidden="true" tabindex="-1"></a><span class="st"> <th>y</th></span></span> <span id="cb15-6"><a href="#cb15-6" aria-hidden="true" tabindex="-1"></a><span class="st"> </tr></span></span> <span id="cb15-7"><a href="#cb15-7" aria-hidden="true" tabindex="-1"></a><span class="st"> <tr></span></span> <span id="cb15-8"><a href="#cb15-8" aria-hidden="true" tabindex="-1"></a><span class="st"> <td>1.5</td></span></span> <span id="cb15-9"><a href="#cb15-9" aria-hidden="true" tabindex="-1"></a><span class="st"> <td>2.7</td></span></span> <span id="cb15-10"><a href="#cb15-10" aria-hidden="true" tabindex="-1"></a><span class="st"> </tr></span></span> <span id="cb15-11"><a href="#cb15-11" aria-hidden="true" tabindex="-1"></a><span class="st"> <tr></span></span> <span id="cb15-12"><a href="#cb15-12" aria-hidden="true" tabindex="-1"></a><span class="st"> <td>4.9</td></span></span> <span id="cb15-13"><a href="#cb15-13" aria-hidden="true" tabindex="-1"></a><span class="st"> <td>1.3</td></span></span> <span id="cb15-14"><a href="#cb15-14" aria-hidden="true" 
tabindex="-1"></a><span class="st"> </tr></span></span> <span id="cb15-15"><a href="#cb15-15" aria-hidden="true" tabindex="-1"></a><span class="st"> <tr></span></span> <span id="cb15-16"><a href="#cb15-16" aria-hidden="true" tabindex="-1"></a><span class="st"> <td>7.2</td></span></span> <span id="cb15-17"><a href="#cb15-17" aria-hidden="true" tabindex="-1"></a><span class="st"> <td>8.1</td></span></span> <span id="cb15-18"><a href="#cb15-18" aria-hidden="true" tabindex="-1"></a><span class="st"> </tr></span></span> <span id="cb15-19"><a href="#cb15-19" aria-hidden="true" tabindex="-1"></a><span class="st"> </table></span></span> <span id="cb15-20"><a href="#cb15-20" aria-hidden="true" tabindex="-1"></a><span class="st"> "</span>)</span></code></pre></div> <p>Because tables are a common way to store data, rvest includes the handy <code>html_table()</code> which converts a table into a data frame:</p> <div class="sourceCode" id="cb16"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" aria-hidden="true" tabindex="-1"></a>html <span class="sc">%>%</span> </span> <span id="cb16-2"><a href="#cb16-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">html_node</span>(<span class="st">"table"</span>) <span class="sc">%>%</span> </span> <span id="cb16-3"><a href="#cb16-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">html_table</span>()</span> <span id="cb16-4"><a href="#cb16-4" aria-hidden="true" tabindex="-1"></a><span class="co">#> # A tibble: 3 × 2</span></span> <span id="cb16-5"><a href="#cb16-5" aria-hidden="true" tabindex="-1"></a><span class="co">#> x y</span></span> <span id="cb16-6"><a href="#cb16-6" aria-hidden="true" tabindex="-1"></a><span class="co">#> <dbl> <dbl></span></span> <span id="cb16-7"><a href="#cb16-7" aria-hidden="true" tabindex="-1"></a><span class="co">#> 1 1.5 2.7</span></span> <span id="cb16-8"><a href="#cb16-8" aria-hidden="true" tabindex="-1"></a><span class="co">#> 2 4.9 1.3</span></span> 
<span id="cb16-9"><a href="#cb16-9" aria-hidden="true" tabindex="-1"></a><span class="co">#> 3 7.2 8.1</span></span></code></pre></div> </div> </div> <div id="element-vs-elements" class="section level2"> <h2>Element vs elements</h2> <p>When using rvest, your eventual goal is usually to build up a data frame, and you want each row to correspond to some repeated unit on the HTML page. In this case, you should generally start by using <code>html_elements()</code> to select the elements that contain each observation, then use <code>html_element()</code> to extract the variables from each observation. This guarantees that you’ll get the same number of values for each variable because <code>html_element()</code> always returns the same number of outputs as inputs.</p> <p>To illustrate this problem, take a look at this simple example I constructed using a few entries from <code>dplyr::starwars</code>:</p> <div class="sourceCode" id="cb17"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb17-1"><a href="#cb17-1" aria-hidden="true" tabindex="-1"></a>html <span class="ot"><-</span> <span class="fu">minimal_html</span>(<span class="st">"</span></span> <span id="cb17-2"><a href="#cb17-2" aria-hidden="true" tabindex="-1"></a><span class="st"> <ul></span></span> <span id="cb17-3"><a href="#cb17-3" aria-hidden="true" tabindex="-1"></a><span class="st"> <li><b>C-3PO</b> is a <i>droid</i> that weighs <span class='weight'>167 kg</span></li></span></span> <span id="cb17-4"><a href="#cb17-4" aria-hidden="true" tabindex="-1"></a><span class="st"> <li><b>R2-D2</b> is a <i>droid</i> that weighs <span class='weight'>96 kg</span></li></span></span> <span id="cb17-5"><a href="#cb17-5" aria-hidden="true" tabindex="-1"></a><span class="st"> <li><b>Yoda</b> weighs <span class='weight'>66 kg</span></li></span></span> <span id="cb17-6"><a href="#cb17-6" aria-hidden="true" tabindex="-1"></a><span class="st"> <li><b>R4-P17</b> is a <i>droid</i></li></span></span> <span id="cb17-7"><a 
href="#cb17-7" aria-hidden="true" tabindex="-1"></a><span class="st"> </ul></span></span> <span id="cb17-8"><a href="#cb17-8" aria-hidden="true" tabindex="-1"></a><span class="st"> "</span>)</span></code></pre></div> <p>If you try to extract name, species, and weight directly, you end up with one vector of length four and two vectors of length three, and no way to align them:</p> <div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a>html <span class="sc">%>%</span> <span class="fu">html_elements</span>(<span class="st">"b"</span>) <span class="sc">%>%</span> <span class="fu">html_text2</span>()</span> <span id="cb18-2"><a href="#cb18-2" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "C-3PO" "R2-D2" "Yoda" "R4-P17"</span></span> <span id="cb18-3"><a href="#cb18-3" aria-hidden="true" tabindex="-1"></a>html <span class="sc">%>%</span> <span class="fu">html_elements</span>(<span class="st">"i"</span>) <span class="sc">%>%</span> <span class="fu">html_text2</span>()</span> <span id="cb18-4"><a href="#cb18-4" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "droid" "droid" "droid"</span></span> <span id="cb18-5"><a href="#cb18-5" aria-hidden="true" tabindex="-1"></a>html <span class="sc">%>%</span> <span class="fu">html_elements</span>(<span class="st">".weight"</span>) <span class="sc">%>%</span> <span class="fu">html_text2</span>()</span> <span id="cb18-6"><a href="#cb18-6" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "167 kg" "96 kg" "66 kg"</span></span></code></pre></div> <p>Instead, use <code>html_elements()</code> to find an element that corresponds to each character, then use <code>html_element()</code> to extract each variable for all observations:</p> <div class="sourceCode" id="cb19"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" 
tabindex="-1"></a>characters <span class="ot"><-</span> html <span class="sc">%>%</span> <span class="fu">html_elements</span>(<span class="st">"li"</span>)</span> <span id="cb19-2"><a href="#cb19-2" aria-hidden="true" tabindex="-1"></a></span> <span id="cb19-3"><a href="#cb19-3" aria-hidden="true" tabindex="-1"></a>characters <span class="sc">%>%</span> <span class="fu">html_element</span>(<span class="st">"b"</span>) <span class="sc">%>%</span> <span class="fu">html_text2</span>()</span> <span id="cb19-4"><a href="#cb19-4" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "C-3PO" "R2-D2" "Yoda" "R4-P17"</span></span> <span id="cb19-5"><a href="#cb19-5" aria-hidden="true" tabindex="-1"></a>characters <span class="sc">%>%</span> <span class="fu">html_element</span>(<span class="st">"i"</span>) <span class="sc">%>%</span> <span class="fu">html_text2</span>()</span> <span id="cb19-6"><a href="#cb19-6" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "droid" "droid" NA "droid"</span></span> <span id="cb19-7"><a href="#cb19-7" aria-hidden="true" tabindex="-1"></a>characters <span class="sc">%>%</span> <span class="fu">html_element</span>(<span class="st">".weight"</span>) <span class="sc">%>%</span> <span class="fu">html_text2</span>()</span> <span id="cb19-8"><a href="#cb19-8" aria-hidden="true" tabindex="-1"></a><span class="co">#> [1] "167 kg" "96 kg" "66 kg" NA</span></span></code></pre></div> <p><code>html_element()</code> automatically fills in <code>NA</code> when no elements match, keeping all of the variables aligned and making it easy to create a data frame:</p> <div class="sourceCode" id="cb20"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a><span class="fu">data.frame</span>(</span> <span id="cb20-2"><a href="#cb20-2" aria-hidden="true" tabindex="-1"></a> <span class="at">name =</span> characters <span class="sc">%>%</span> <span 
class="fu">html_element</span>(<span class="st">"b"</span>) <span class="sc">%>%</span> <span class="fu">html_text2</span>(),</span> <span id="cb20-3"><a href="#cb20-3" aria-hidden="true" tabindex="-1"></a> <span class="at">species =</span> characters <span class="sc">%>%</span> <span class="fu">html_element</span>(<span class="st">"i"</span>) <span class="sc">%>%</span> <span class="fu">html_text2</span>(),</span> <span id="cb20-4"><a href="#cb20-4" aria-hidden="true" tabindex="-1"></a> <span class="at">weight =</span> characters <span class="sc">%>%</span> <span class="fu">html_element</span>(<span class="st">".weight"</span>) <span class="sc">%>%</span> <span class="fu">html_text2</span>()</span> <span id="cb20-5"><a href="#cb20-5" aria-hidden="true" tabindex="-1"></a>)</span> <span id="cb20-6"><a href="#cb20-6" aria-hidden="true" tabindex="-1"></a><span class="co">#> name species weight</span></span> <span id="cb20-7"><a href="#cb20-7" aria-hidden="true" tabindex="-1"></a><span class="co">#> 1 C-3PO droid 167 kg</span></span> <span id="cb20-8"><a href="#cb20-8" aria-hidden="true" tabindex="-1"></a><span class="co">#> 2 R2-D2 droid 96 kg</span></span> <span id="cb20-9"><a href="#cb20-9" aria-hidden="true" tabindex="-1"></a><span class="co">#> 3 Yoda <NA> 66 kg</span></span> <span id="cb20-10"><a href="#cb20-10" aria-hidden="true" tabindex="-1"></a><span class="co">#> 4 R4-P17 droid <NA></span></span></code></pre></div> </div> <div class="footnotes footnotes-end-of-document"> <hr /> <ol> <li id="fn1"><p>A number of tags (including <code><p></code> and <code><li></code>) don’t require end tags, but I think it’s best to include them because it makes seeing the structure of the HTML a little easier.<a href="#fnref1" class="footnote-back">↩︎</a></p></li> <li id="fn2"><p>This class comes from the <a href="https://xml2.r-lib.org">xml2</a> package. 
xml2 is a low-level package that rvest builds on top of.<a href="#fnref2" class="footnote-back">↩︎</a></p></li> <li id="fn3"><p>Or another element, more on that shortly.<a href="#fnref3" class="footnote-back">↩︎</a></p></li> </ol> </div>
<!-- code folding -->
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  // Build a <script> element at runtime and attach it to the document's
  // first <head>, so MathJax is fetched from its remote URL when the page
  // runs instead of being referenced by a static tag in the markup.
  (function () {
    var headNode = document.getElementsByTagName("head")[0];
    var mathjaxTag = document.createElement("script");

    // Same URL and MIME type as before; both are set before insertion.
    mathjaxTag.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    mathjaxTag.type = "text/javascript";

    // Appending the element is what triggers the download.
    headNode.appendChild(mathjaxTag);
  })();
</script>
</body>
</html>