EVOLUTION-MANAGER
Edit File: pivot.html
<!DOCTYPE html> <html> <head> <meta charset="utf-8" /> <meta name="generator" content="pandoc" /> <meta http-equiv="X-UA-Compatible" content="IE=EDGE" /> <meta name="viewport" content="width=device-width, initial-scale=1" /> <title>Pivoting</title> <script>// Pandoc 2.9 adds attributes on both header and div. We remove the former (to // be compatible with the behavior of Pandoc < 2.8). document.addEventListener('DOMContentLoaded', function(e) { var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); var i, h, a; for (i = 0; i < hs.length; i++) { h = hs[i]; if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 a = h.attributes; while (a.length > 0) h.removeAttribute(a[0].name); } }); </script> <script>// Hide empty <a> tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> // v0.0.1 // Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. document.addEventListener('DOMContentLoaded', function() { const codeList = document.getElementsByClassName("sourceCode"); for (var i = 0; i < codeList.length; i++) { var linkList = codeList[i].getElementsByTagName('a'); for (var j = 0; j < linkList.length; j++) { if (linkList[j].innerHTML === "") { linkList[j].setAttribute('aria-hidden', 'true'); } } } }); </script> <style type="text/css">code{white-space: pre;}</style> <style type="text/css" data-origin="pandoc"> pre > code.sourceCode { white-space: pre; position: relative; } pre > code.sourceCode > span { display: inline-block; line-height: 1.25; } pre > code.sourceCode > span:empty { height: 1.2em; } code.sourceCode > span { color: inherit; text-decoration: inherit; } div.sourceCode { margin: 1em 0; } pre.sourceCode { margin: 0; } @media screen { div.sourceCode { overflow: auto; } } @media print { pre > code.sourceCode { white-space: pre-wrap; } pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; } } pre.numberSource code { counter-reset: source-line 0; } pre.numberSource code > span { position: relative; left: -4em; counter-increment: source-line; } pre.numberSource code > span > a:first-child::before { content: counter(source-line); position: relative; left: -1em; text-align: right; vertical-align: baseline; border: none; display: inline-block; -webkit-touch-callout: none; -webkit-user-select: none; -khtml-user-select: none; -moz-user-select: none; -ms-user-select: none; user-select: none; padding: 0 4px; width: 4em; color: #aaaaaa; } pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; } div.sourceCode { } @media screen { pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; } } code span.al { color: #ff0000; font-weight: bold; } /* Alert */ code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */ code span.at { color: #7d9029; } /* Attribute */ code span.bn { color: #40a070; } /* BaseN */ code span.bu { } /* BuiltIn */ code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */ code span.ch { color: #4070a0; } /* Char */ code span.cn { color: #880000; } /* Constant */ code span.co { color: #60a0b0; font-style: italic; } /* Comment */ code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */ code span.do { color: #ba2121; font-style: italic; } /* Documentation */ code span.dt { color: #902000; } /* DataType */ code span.dv { color: #40a070; } /* DecVal */ code span.er { color: #ff0000; font-weight: bold; } /* Error */ code span.ex { } /* Extension */ code span.fl { color: #40a070; } /* Float */ code span.fu { color: #06287e; } /* Function */ code span.im { } /* Import */ code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */ code span.kw { color: #007020; font-weight: bold; } /* Keyword */ code span.op { color: #666666; } /* Operator */ code span.ot { color: #007020; } /* Other */ code span.pp { color: #bc7a00; } /* Preprocessor */ code span.sc { color: #4070a0; } /* SpecialChar */ code span.ss { color: #bb6688; } /* SpecialString */ code span.st { color: #4070a0; } /* String */ code span.va { color: #19177c; } /* Variable */ code span.vs { color: #4070a0; } /* VerbatimString */ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */ </style> <script> // apply pandoc div.sourceCode style to pre.sourceCode instead (function() { var sheets = document.styleSheets; for (var i = 0; i < sheets.length; i++) { if (sheets[i].ownerNode.dataset["origin"] !== "pandoc") continue; try { var rules = sheets[i].cssRules; } catch (e) { continue; } for (var j = 0; j < rules.length; j++) { var rule = rules[j]; // check if there is a div.sourceCode rule if (rule.type !== rule.STYLE_RULE || rule.selectorText !== "div.sourceCode") continue; var style = rule.style.cssText; // check if color or background-color is set if (rule.style.color === '' && rule.style.backgroundColor === '') continue; // replace div.sourceCode by a pre.sourceCode rule sheets[i].deleteRule(j); sheets[i].insertRule('pre.sourceCode{' + style + '}', j); } } })(); </script> <style type="text/css">body { background-color: #fff; margin: 1em auto; max-width: 700px; overflow: visible; padding-left: 2em; padding-right: 2em; font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif; font-size: 14px; line-height: 1.35; } #TOC { clear: both; margin: 0 0 10px 10px; padding: 4px; width: 400px; border: 1px solid #CCCCCC; border-radius: 5px; background-color: #f6f6f6; font-size: 13px; line-height: 1.3; } #TOC .toctitle { font-weight: bold; font-size: 15px; margin-left: 5px; } #TOC ul { padding-left: 40px; margin-left: -1.5em; margin-top: 5px; margin-bottom: 5px; } #TOC ul ul { margin-left: -2em; } #TOC li { line-height: 16px; } table { margin: 1em auto; border-width: 1px; border-color: #DDDDDD; border-style: outset; border-collapse: collapse; } table th { border-width: 2px; padding: 5px; border-style: inset; } table td { border-width: 1px; border-style: inset; line-height: 18px; padding: 5px 5px; } table, table th, table td { border-left-style: none; border-right-style: none; } table thead, table tr.even { background-color: #f7f7f7; } p { margin: 0.5em 0; } blockquote { background-color: #f6f6f6; padding: 0.25em 0.75em; } hr { border-style: solid; border: none; border-top: 1px solid #777; margin: 28px 0; } dl { margin-left: 0; } dl dd { margin-bottom: 13px; margin-left: 13px; } dl dt { font-weight: bold; } ul { margin-top: 0; } ul li { list-style: circle outside; } ul ul { margin-bottom: 0; } pre, code { background-color: #f7f7f7; border-radius: 3px; color: #333; white-space: pre-wrap; } pre { border-radius: 3px; margin: 5px 0px 10px 0px; padding: 10px; } pre:not([class]) { background-color: #f7f7f7; } code { font-family: Consolas, Monaco, 'Courier New', monospace; font-size: 85%; } p > code, li > code { padding: 2px 0px; } div.figure { text-align: center; } img { background-color: #FFFFFF; padding: 2px; border: 1px solid #DDDDDD; border-radius: 3px; border: 1px solid #CCCCCC; margin: 0 5px; } h1 { margin-top: 0; font-size: 35px; line-height: 40px; } h2 { border-bottom: 4px solid #f7f7f7; padding-top: 10px; padding-bottom: 2px; font-size: 145%; } h3 { border-bottom: 2px solid #f7f7f7; padding-top: 10px; font-size: 120%; } h4 { border-bottom: 1px solid #f7f7f7; margin-left: 8px; font-size: 105%; } h5, h6 { border-bottom: 1px solid #ccc; font-size: 105%; } a { color: #0033dd; text-decoration: none; } a:hover { color: #6666ff; } a:visited { color: #800080; } a:visited:hover { color: #BB00BB; } a[href^="http:"] { text-decoration: underline; } a[href^="https:"] { text-decoration: underline; } code > span.kw { color: #555; font-weight: bold; } code > span.dt { color: #902000; } code > span.dv { color: #40a070; } code > span.bn { color: #d14; } code > span.fl { color: #d14; } code > span.ch { color: #d14; } code > span.st { color: #d14; } code > span.co { color: #888888; font-style: italic; } code > span.ot { color: #007020; } code > span.al { color: #ff0000; font-weight: bold; } code > span.fu { color: #900; font-weight: bold; } code > span.er { color: #a61717; background-color: #e3d2d2; } </style> </head> <body> <h1 class="title toc-ignore">Pivoting</h1> <div id="introduction" class="section level1"> <h1>Introduction</h1> <p>This vignette describes the use of the new <code>pivot_longer()</code> and <code>pivot_wider()</code> functions. Their goal is to improve the usability of <code>gather()</code> and <code>spread()</code>, and incorporate state-of-the-art features found in other packages.</p> <p>For some time, it’s been obvious that there is something fundamentally wrong with the design of <code>spread()</code> and <code>gather()</code>. Many people don’t find the names intuitive and find it hard to remember which direction corresponds to spreading and which to gathering. It also seems surprisingly hard to remember the arguments to these functions, meaning that many people (including me!) have to consult the documentation every time.</p> <p>There are two important new features inspired by other R packages that have been advancing reshaping in R:</p> <ul> <li><p><code>pivot_longer()</code> can work with multiple value variables that may have different types, inspired by the enhanced <code>melt()</code> and <code>dcast()</code> functions provided by the <a href="https://github.com/Rdatatable/data.table/wiki">data.table</a> package by Matt Dowle and Arun Srinivasan.</p></li> <li><p><code>pivot_longer()</code> and <code>pivot_wider()</code> can take a data frame that specifies precisely how metadata stored in column names becomes data variables (and vice versa), inspired by the <a href="https://winvector.github.io/cdata/">cdata</a> package by John Mount and Nina Zumel.</p></li> </ul> <p>In this vignette, you’ll learn the key ideas behind <code>pivot_longer()</code> and <code>pivot_wider()</code> as you see them used to solve a variety of data reshaping challenges ranging from simple to complex.</p> <p>To begin we’ll load some needed packages. In real analysis code, I’d imagine you’d do with the <code>library(tidyverse)</code>, but I can’t do that here since this vignette is embedded in a package.</p> <div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1"></a><span class="kw">library</span>(tidyr)</span> <span id="cb1-2"><a href="#cb1-2"></a><span class="kw">library</span>(dplyr)</span> <span id="cb1-3"><a href="#cb1-3"></a><span class="kw">library</span>(readr)</span></code></pre></div> </div> <div id="longer" class="section level1"> <h1>Longer</h1> <p><code>pivot_longer()</code> makes datasets <strong>longer</strong> by increasing the number of rows and decreasing the number of columns. I don’t believe it makes sense to describe a dataset as being in “long form”. Length is a relative term, and you can only say (e.g.) that dataset A is longer than dataset B.</p> <p><code>pivot_longer()</code> is commonly needed to tidy wild-caught datasets as they often optimise for ease of data entry or ease of comparison rather than ease of analysis. The following sections show how to use <code>pivot_longer()</code> for a wide range of realistic datasets.</p> <div id="pew" class="section level2"> <h2>String data in column names</h2> <p>The <code>relig_income</code> dataset stores counts based on a survey which (among other things) asked people about their religion and annual income:</p> <div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1"></a>relig_income</span> <span id="cb2-2"><a href="#cb2-2"></a><span class="co">#> # A tibble: 18 x 11</span></span> <span id="cb2-3"><a href="#cb2-3"></a><span class="co">#> religion `<$10k` `$10-20k` `$20-30k` `$30-40k` `$40-50k` `$50-75k` `$75-100k`</span></span> <span id="cb2-4"><a href="#cb2-4"></a><span class="co">#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl></span></span> <span id="cb2-5"><a href="#cb2-5"></a><span class="co">#> 1 Agnostic 27 34 60 81 76 137 122</span></span> <span id="cb2-6"><a href="#cb2-6"></a><span class="co">#> 2 Atheist 12 27 37 52 35 70 73</span></span> <span id="cb2-7"><a href="#cb2-7"></a><span class="co">#> 3 Buddhist 27 21 30 34 33 58 62</span></span> <span id="cb2-8"><a href="#cb2-8"></a><span class="co">#> 4 Catholic 418 617 732 670 638 1116 949</span></span> <span id="cb2-9"><a href="#cb2-9"></a><span class="co">#> 5 Don’t k… 15 14 15 11 10 35 21</span></span> <span id="cb2-10"><a href="#cb2-10"></a><span class="co">#> 6 Evangel… 575 869 1064 982 881 1486 949</span></span> <span id="cb2-11"><a href="#cb2-11"></a><span class="co">#> 7 Hindu 1 9 7 9 11 34 47</span></span> <span id="cb2-12"><a href="#cb2-12"></a><span class="co">#> 8 Histori… 228 244 236 238 197 223 131</span></span> <span id="cb2-13"><a href="#cb2-13"></a><span class="co">#> 9 Jehovah… 20 27 24 24 21 30 15</span></span> <span id="cb2-14"><a href="#cb2-14"></a><span class="co">#> 10 Jewish 19 19 25 25 30 95 69</span></span> <span id="cb2-15"><a href="#cb2-15"></a><span class="co">#> # … with 8 more rows, and 3 more variables: `$100-150k` <dbl>, `>150k` <dbl>,</span></span> <span id="cb2-16"><a href="#cb2-16"></a><span class="co">#> # `Don't know/refused` <dbl></span></span></code></pre></div> <p>This dataset contains three variables:</p> <ul> <li><code>religion</code>, stored in the rows,</li> <li><code>income</code> spread across the column names, and</li> <li><code>count</code> stored in the cell values.</li> </ul> <p>To tidy it we use <code>pivot_longer()</code>:</p> <div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1"></a>relig_income <span class="op">%>%</span><span class="st"> </span></span> <span id="cb3-2"><a href="#cb3-2"></a><span class="st"> </span><span class="kw">pivot_longer</span>(<span class="op">!</span>religion, <span class="dt">names_to =</span> <span class="st">"income"</span>, <span class="dt">values_to =</span> <span class="st">"count"</span>)</span> <span id="cb3-3"><a href="#cb3-3"></a><span class="co">#> # A tibble: 180 x 3</span></span> <span id="cb3-4"><a href="#cb3-4"></a><span class="co">#> religion income count</span></span> <span id="cb3-5"><a href="#cb3-5"></a><span class="co">#> <chr> <chr> <dbl></span></span> <span id="cb3-6"><a href="#cb3-6"></a><span class="co">#> 1 Agnostic <$10k 27</span></span> <span id="cb3-7"><a href="#cb3-7"></a><span class="co">#> 2 Agnostic $10-20k 34</span></span> <span id="cb3-8"><a href="#cb3-8"></a><span class="co">#> 3 Agnostic $20-30k 60</span></span> <span id="cb3-9"><a href="#cb3-9"></a><span class="co">#> 4 Agnostic $30-40k 81</span></span> <span id="cb3-10"><a href="#cb3-10"></a><span class="co">#> 5 Agnostic $40-50k 76</span></span> <span id="cb3-11"><a href="#cb3-11"></a><span class="co">#> 6 Agnostic $50-75k 137</span></span> <span id="cb3-12"><a href="#cb3-12"></a><span class="co">#> 7 Agnostic $75-100k 122</span></span> <span id="cb3-13"><a href="#cb3-13"></a><span class="co">#> 8 Agnostic $100-150k 109</span></span> <span id="cb3-14"><a href="#cb3-14"></a><span class="co">#> 9 Agnostic >150k 84</span></span> <span id="cb3-15"><a href="#cb3-15"></a><span class="co">#> 10 Agnostic Don't know/refused 96</span></span> <span id="cb3-16"><a href="#cb3-16"></a><span class="co">#> # … with 170 more rows</span></span></code></pre></div> <ul> <li><p>The first argument is the dataset to reshape, <code>relig_income</code>.</p></li> <li><p>The second argument describes which columns need to be reshaped. In this case, it’s every column apart from <code>religion</code>.</p></li> <li><p>The <code>names_to</code> gives the name of the variable that will be created from the data stored in the column names, i.e. <code>income</code>.</p></li> <li><p>The <code>values_to</code> gives the name of the variable that will be created from the data stored in the cell value, i.e. <code>count</code>.</p></li> </ul> <p>Neither the <code>names_to</code> nor the <code>values_to</code> column exists in <code>relig_income</code>, so we provide them as character strings surrounded in quotes.</p> </div> <div id="billboard" class="section level2"> <h2>Numeric data in column names</h2> <p>The <code>billboard</code> dataset records the billboard rank of songs in the year 2000. It has a form similar to the <code>relig_income</code> data, but the data encoded in the column names is really a number, not a string.</p> <div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1"></a>billboard</span> <span id="cb4-2"><a href="#cb4-2"></a><span class="co">#> # A tibble: 317 x 79</span></span> <span id="cb4-3"><a href="#cb4-3"></a><span class="co">#> artist track date.entered wk1 wk2 wk3 wk4 wk5 wk6 wk7 wk8</span></span> <span id="cb4-4"><a href="#cb4-4"></a><span class="co">#> <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl></span></span> <span id="cb4-5"><a href="#cb4-5"></a><span class="co">#> 1 2 Pac Baby… 2000-02-26 87 82 72 77 87 94 99 NA</span></span> <span id="cb4-6"><a href="#cb4-6"></a><span class="co">#> 2 2Ge+h… The … 2000-09-02 91 87 92 NA NA NA NA NA</span></span> <span id="cb4-7"><a href="#cb4-7"></a><span class="co">#> 3 3 Doo… Kryp… 2000-04-08 81 70 68 67 66 57 54 53</span></span> <span id="cb4-8"><a href="#cb4-8"></a><span class="co">#> 4 3 Doo… Loser 2000-10-21 76 76 72 69 67 65 55 59</span></span> <span id="cb4-9"><a href="#cb4-9"></a><span class="co">#> 5 504 B… Wobb… 2000-04-15 57 34 25 17 17 31 36 49</span></span> <span id="cb4-10"><a href="#cb4-10"></a><span class="co">#> 6 98^0 Give… 2000-08-19 51 39 34 26 26 19 2 2</span></span> <span id="cb4-11"><a href="#cb4-11"></a><span class="co">#> 7 A*Tee… Danc… 2000-07-08 97 97 96 95 100 NA NA NA</span></span> <span id="cb4-12"><a href="#cb4-12"></a><span class="co">#> 8 Aaliy… I Do… 2000-01-29 84 62 51 41 38 35 35 38</span></span> <span id="cb4-13"><a href="#cb4-13"></a><span class="co">#> 9 Aaliy… Try … 2000-03-18 59 53 38 28 21 18 16 14</span></span> <span id="cb4-14"><a href="#cb4-14"></a><span class="co">#> 10 Adams… Open… 2000-08-26 76 76 74 69 68 67 61 58</span></span> <span id="cb4-15"><a href="#cb4-15"></a><span class="co">#> # … with 307 more rows, and 68 more variables: wk9 <dbl>, wk10 <dbl>,</span></span> <span id="cb4-16"><a href="#cb4-16"></a><span class="co">#> # wk11 <dbl>, wk12 <dbl>, wk13 <dbl>, wk14 <dbl>, wk15 <dbl>, wk16 <dbl>,</span></span> <span id="cb4-17"><a href="#cb4-17"></a><span class="co">#> # wk17 <dbl>, wk18 <dbl>, wk19 <dbl>, wk20 <dbl>, wk21 <dbl>, wk22 <dbl>,</span></span> <span id="cb4-18"><a href="#cb4-18"></a><span class="co">#> # wk23 <dbl>, wk24 <dbl>, wk25 <dbl>, wk26 <dbl>, wk27 <dbl>, wk28 <dbl>,</span></span> <span id="cb4-19"><a href="#cb4-19"></a><span class="co">#> # wk29 <dbl>, wk30 <dbl>, wk31 <dbl>, wk32 <dbl>, wk33 <dbl>, wk34 <dbl>,</span></span> <span id="cb4-20"><a href="#cb4-20"></a><span class="co">#> # wk35 <dbl>, wk36 <dbl>, wk37 <dbl>, wk38 <dbl>, wk39 <dbl>, wk40 <dbl>,</span></span> <span id="cb4-21"><a href="#cb4-21"></a><span class="co">#> # wk41 <dbl>, wk42 <dbl>, wk43 <dbl>, wk44 <dbl>, wk45 <dbl>, wk46 <dbl>,</span></span> <span id="cb4-22"><a href="#cb4-22"></a><span class="co">#> # wk47 <dbl>, wk48 <dbl>, wk49 <dbl>, wk50 <dbl>, wk51 <dbl>, wk52 <dbl>,</span></span> <span id="cb4-23"><a href="#cb4-23"></a><span class="co">#> # wk53 <dbl>, wk54 <dbl>, wk55 <dbl>, wk56 <dbl>, wk57 <dbl>, wk58 <dbl>,</span></span> <span id="cb4-24"><a href="#cb4-24"></a><span class="co">#> # wk59 <dbl>, wk60 <dbl>, wk61 <dbl>, wk62 <dbl>, wk63 <dbl>, wk64 <dbl>,</span></span> <span id="cb4-25"><a href="#cb4-25"></a><span class="co">#> # wk65 <dbl>, wk66 <lgl>, wk67 <lgl>, wk68 <lgl>, wk69 <lgl>, wk70 <lgl>,</span></span> <span id="cb4-26"><a href="#cb4-26"></a><span class="co">#> # wk71 <lgl>, wk72 <lgl>, wk73 <lgl>, wk74 <lgl>, wk75 <lgl>, wk76 <lgl></span></span></code></pre></div> <p>We can start with the same basic specification as for the <code>relig_income</code> dataset. Here we want the names to become a variable called <code>week</code>, and the values to become a variable called <code>rank</code>. I also use <code>values_drop_na</code> to drop rows that correspond to missing values. Not every song stays in the charts for all 76 weeks, so the structure of the input data force the creation of unnessary explicit <code>NA</code>s.</p> <div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1"></a>billboard <span class="op">%>%</span><span class="st"> </span></span> <span id="cb5-2"><a href="#cb5-2"></a><span class="st"> </span><span class="kw">pivot_longer</span>(</span> <span id="cb5-3"><a href="#cb5-3"></a> <span class="dt">cols =</span> <span class="kw">starts_with</span>(<span class="st">"wk"</span>), </span> <span id="cb5-4"><a href="#cb5-4"></a> <span class="dt">names_to =</span> <span class="st">"week"</span>, </span> <span id="cb5-5"><a href="#cb5-5"></a> <span class="dt">values_to =</span> <span class="st">"rank"</span>,</span> <span id="cb5-6"><a href="#cb5-6"></a> <span class="dt">values_drop_na =</span> <span class="ot">TRUE</span></span> <span id="cb5-7"><a href="#cb5-7"></a> )</span> <span id="cb5-8"><a href="#cb5-8"></a><span class="co">#> # A tibble: 5,307 x 5</span></span> <span id="cb5-9"><a href="#cb5-9"></a><span class="co">#> artist track date.entered week rank</span></span> <span id="cb5-10"><a href="#cb5-10"></a><span class="co">#> <chr> <chr> <date> <chr> <dbl></span></span> <span id="cb5-11"><a href="#cb5-11"></a><span class="co">#> 1 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk1 87</span></span> <span id="cb5-12"><a href="#cb5-12"></a><span class="co">#> 2 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk2 82</span></span> <span id="cb5-13"><a href="#cb5-13"></a><span class="co">#> 3 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk3 72</span></span> <span id="cb5-14"><a href="#cb5-14"></a><span class="co">#> 4 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk4 77</span></span> <span id="cb5-15"><a href="#cb5-15"></a><span class="co">#> 5 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk5 87</span></span> <span id="cb5-16"><a href="#cb5-16"></a><span class="co">#> 6 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk6 94</span></span> <span id="cb5-17"><a href="#cb5-17"></a><span class="co">#> 7 2 Pac Baby Don't Cry (Keep... 2000-02-26 wk7 99</span></span> <span id="cb5-18"><a href="#cb5-18"></a><span class="co">#> 8 2Ge+her The Hardest Part Of ... 2000-09-02 wk1 91</span></span> <span id="cb5-19"><a href="#cb5-19"></a><span class="co">#> 9 2Ge+her The Hardest Part Of ... 2000-09-02 wk2 87</span></span> <span id="cb5-20"><a href="#cb5-20"></a><span class="co">#> 10 2Ge+her The Hardest Part Of ... 2000-09-02 wk3 92</span></span> <span id="cb5-21"><a href="#cb5-21"></a><span class="co">#> # … with 5,297 more rows</span></span></code></pre></div> <p>It would be nice to easily determine how long each song stayed in the charts, but to do that, we’ll need to convert the <code>week</code> variable to an integer. We can do that by using two additional arguments: <code>names_prefix</code> strips off the <code>wk</code> prefix, and <code>names_transform</code> converts <code>week</code> into an integer:</p> <div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1"></a>billboard <span class="op">%>%</span><span class="st"> </span></span> <span id="cb6-2"><a href="#cb6-2"></a><span class="st"> </span><span class="kw">pivot_longer</span>(</span> <span id="cb6-3"><a href="#cb6-3"></a> <span class="dt">cols =</span> <span class="kw">starts_with</span>(<span class="st">"wk"</span>), </span> <span id="cb6-4"><a href="#cb6-4"></a> <span class="dt">names_to =</span> <span class="st">"week"</span>, </span> <span id="cb6-5"><a href="#cb6-5"></a> <span class="dt">names_prefix =</span> <span class="st">"wk"</span>,</span> <span id="cb6-6"><a href="#cb6-6"></a> <span class="dt">names_transform =</span> <span class="kw">list</span>(<span class="dt">week =</span> as.integer),</span> <span id="cb6-7"><a href="#cb6-7"></a> <span class="dt">values_to =</span> <span class="st">"rank"</span>,</span> <span id="cb6-8"><a href="#cb6-8"></a> <span class="dt">values_drop_na =</span> <span class="ot">TRUE</span>,</span> <span id="cb6-9"><a href="#cb6-9"></a> )</span></code></pre></div> <p>Alternatively, you could do this with a single argument by using <code>readr::parse_number()</code> which automatically strips non-numeric components:</p> <div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1"></a>billboard <span class="op">%>%</span><span class="st"> </span></span> <span id="cb7-2"><a href="#cb7-2"></a><span class="st"> </span><span class="kw">pivot_longer</span>(</span> <span id="cb7-3"><a href="#cb7-3"></a> <span class="dt">cols =</span> <span class="kw">starts_with</span>(<span class="st">"wk"</span>), </span> <span id="cb7-4"><a href="#cb7-4"></a> <span class="dt">names_to =</span> <span class="st">"week"</span>, </span> <span id="cb7-5"><a href="#cb7-5"></a> <span class="dt">names_transform =</span> <span class="kw">list</span>(<span class="dt">week =</span> readr<span class="op">::</span>parse_number),</span> <span id="cb7-6"><a href="#cb7-6"></a> <span class="dt">values_to =</span> <span class="st">"rank"</span>,</span> <span id="cb7-7"><a href="#cb7-7"></a> <span class="dt">values_drop_na =</span> <span class="ot">TRUE</span>,</span> <span id="cb7-8"><a href="#cb7-8"></a> )</span></code></pre></div> </div> <div id="many-variables-in-column-names" class="section level2"> <h2>Many variables in column names</h2> <p>A more challenging situation occurs when you have multiple variables crammed into the column names. For example, take the <code>who</code> dataset:</p> <div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1"></a>who</span> <span id="cb8-2"><a href="#cb8-2"></a><span class="co">#> # A tibble: 7,240 x 60</span></span> <span id="cb8-3"><a href="#cb8-3"></a><span class="co">#> country iso2 iso3 year new_sp_m014 new_sp_m1524 new_sp_m2534 new_sp_m3544</span></span> <span id="cb8-4"><a href="#cb8-4"></a><span class="co">#> <chr> <chr> <chr> <int> <int> <int> <int> <int></span></span> <span id="cb8-5"><a href="#cb8-5"></a><span class="co">#> 1 Afghan… AF AFG 1980 NA NA NA NA</span></span> <span id="cb8-6"><a href="#cb8-6"></a><span class="co">#> 2 Afghan… AF AFG 1981 NA NA NA NA</span></span> <span id="cb8-7"><a href="#cb8-7"></a><span class="co">#> 3 Afghan… AF AFG 1982 NA NA NA NA</span></span> <span id="cb8-8"><a href="#cb8-8"></a><span class="co">#> 4 Afghan… AF AFG 1983 NA NA NA NA</span></span> <span id="cb8-9"><a href="#cb8-9"></a><span class="co">#> 5 Afghan… AF AFG 1984 NA NA NA NA</span></span> <span id="cb8-10"><a href="#cb8-10"></a><span class="co">#> 6 Afghan… AF AFG 1985 NA NA NA NA</span></span> <span id="cb8-11"><a href="#cb8-11"></a><span class="co">#> 7 Afghan… AF AFG 1986 NA NA NA NA</span></span> <span id="cb8-12"><a href="#cb8-12"></a><span class="co">#> 8 Afghan… AF AFG 1987 NA NA NA NA</span></span> <span id="cb8-13"><a href="#cb8-13"></a><span class="co">#> 9 Afghan… AF AFG 1988 NA NA NA NA</span></span> <span id="cb8-14"><a href="#cb8-14"></a><span class="co">#> 10 Afghan… AF AFG 1989 NA NA NA NA</span></span> <span id="cb8-15"><a href="#cb8-15"></a><span class="co">#> # … with 7,230 more rows, and 52 more variables: new_sp_m4554 <int>,</span></span> <span id="cb8-16"><a href="#cb8-16"></a><span class="co">#> # new_sp_m5564 <int>, new_sp_m65 <int>, new_sp_f014 <int>,</span></span> <span id="cb8-17"><a href="#cb8-17"></a><span class="co">#> # new_sp_f1524 <int>, new_sp_f2534 <int>, new_sp_f3544 <int>,</span></span> <span id="cb8-18"><a href="#cb8-18"></a><span class="co">#> # new_sp_f4554 <int>, new_sp_f5564 <int>, new_sp_f65 <int>,</span></span> <span id="cb8-19"><a href="#cb8-19"></a><span class="co">#> # new_sn_m014 <int>, new_sn_m1524 <int>, new_sn_m2534 <int>,</span></span> <span id="cb8-20"><a href="#cb8-20"></a><span class="co">#> # new_sn_m3544 <int>, new_sn_m4554 <int>, new_sn_m5564 <int>,</span></span> <span id="cb8-21"><a href="#cb8-21"></a><span class="co">#> # new_sn_m65 <int>, new_sn_f014 <int>, new_sn_f1524 <int>,</span></span> <span id="cb8-22"><a href="#cb8-22"></a><span class="co">#> # new_sn_f2534 <int>, new_sn_f3544 <int>, new_sn_f4554 <int>,</span></span> <span id="cb8-23"><a href="#cb8-23"></a><span class="co">#> # new_sn_f5564 <int>, new_sn_f65 <int>, new_ep_m014 <int>,</span></span> <span id="cb8-24"><a href="#cb8-24"></a><span class="co">#> # new_ep_m1524 <int>, new_ep_m2534 <int>, new_ep_m3544 <int>,</span></span> <span id="cb8-25"><a href="#cb8-25"></a><span class="co">#> # new_ep_m4554 <int>, new_ep_m5564 <int>, new_ep_m65 <int>,</span></span> <span id="cb8-26"><a href="#cb8-26"></a><span class="co">#> # new_ep_f014 <int>, new_ep_f1524 <int>, new_ep_f2534 <int>,</span></span> <span id="cb8-27"><a href="#cb8-27"></a><span class="co">#> # new_ep_f3544 <int>, new_ep_f4554 <int>, new_ep_f5564 <int>,</span></span> <span id="cb8-28"><a href="#cb8-28"></a><span class="co">#> # new_ep_f65 <int>, newrel_m014 <int>, newrel_m1524 <int>,</span></span> <span id="cb8-29"><a href="#cb8-29"></a><span class="co">#> # newrel_m2534 <int>, newrel_m3544 <int>, newrel_m4554 <int>,</span></span> <span id="cb8-30"><a href="#cb8-30"></a><span class="co">#> # newrel_m5564 <int>, newrel_m65 <int>, newrel_f014 <int>,</span></span> <span id="cb8-31"><a href="#cb8-31"></a><span class="co">#> # newrel_f1524 <int>, newrel_f2534 <int>, newrel_f3544 <int>,</span></span> <span id="cb8-32"><a href="#cb8-32"></a><span class="co">#> # newrel_f4554 <int>, newrel_f5564 <int>, newrel_f65 <int></span></span></code></pre></div> <p><code>country</code>, <code>iso2</code>, <code>iso3</code>, and <code>year</code> are already variables, so they can be left as is. But the columns from <code>new_sp_m014</code> to <code>newrel_f65</code> encode four variables in their names:</p> <ul> <li><p>The <code>new_</code>/<code>new</code> prefix indicates these are counts of new cases. This dataset only contains new cases, so we’ll ignore it here because it’s constant.</p></li> <li><p><code>sp</code>/<code>rel</code>/<code>ep</code> describe how the case was diagnosed.</p></li> <li><p><code>m</code>/<code>f</code> gives the gender.</p></li> <li><p><code>014</code>/<code>1524</code>/<code>2535</code>/<code>3544</code>/<code>4554</code>/<code>65</code> supplies the age range.</p></li> </ul> <p>We can break these variables up by specifying multiple column names in <code>names_to</code>, and then either providing <code>names_sep</code> or <code>names_pattern</code>. Here <code>names_pattern</code> is the most natural fit. It has a similar interface to <code>extract</code>: you give it a regular expression containing groups (defined by <code>()</code>) and it puts each group in a column.</p> <div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1"></a>who <span class="op">%>%</span><span class="st"> </span><span class="kw">pivot_longer</span>(</span> <span id="cb9-2"><a href="#cb9-2"></a> <span class="dt">cols =</span> new_sp_m014<span class="op">:</span>newrel_f65,</span> <span id="cb9-3"><a href="#cb9-3"></a> <span class="dt">names_to =</span> <span class="kw">c</span>(<span class="st">"diagnosis"</span>, <span class="st">"gender"</span>, <span class="st">"age"</span>), </span> <span id="cb9-4"><a href="#cb9-4"></a> <span class="dt">names_pattern =</span> <span class="st">"new_?(.*)_(.)(.*)"</span>,</span> <span id="cb9-5"><a href="#cb9-5"></a> <span class="dt">values_to =</span> <span class="st">"count"</span></span> <span id="cb9-6"><a href="#cb9-6"></a>)</span> <span id="cb9-7"><a href="#cb9-7"></a><span class="co">#> # A tibble: 405,440 x 8</span></span> <span id="cb9-8"><a href="#cb9-8"></a><span class="co">#> country iso2 iso3 year diagnosis gender age count</span></span> <span id="cb9-9"><a href="#cb9-9"></a><span class="co">#> <chr> <chr> <chr> <int> <chr> <chr> <chr> <int></span></span> <span id="cb9-10"><a href="#cb9-10"></a><span class="co">#> 1 Afghanistan AF AFG 1980 sp m 014 NA</span></span> <span id="cb9-11"><a href="#cb9-11"></a><span class="co">#> 2 Afghanistan AF AFG 1980 sp m 1524 NA</span></span> <span id="cb9-12"><a href="#cb9-12"></a><span class="co">#> 3 Afghanistan AF AFG 1980 sp m 2534 NA</span></span> <span id="cb9-13"><a href="#cb9-13"></a><span class="co">#> 4 Afghanistan AF AFG 1980 sp m 3544 NA</span></span> <span id="cb9-14"><a href="#cb9-14"></a><span class="co">#> 5 Afghanistan AF AFG 1980 sp m 4554 NA</span></span> <span id="cb9-15"><a href="#cb9-15"></a><span class="co">#> 6 Afghanistan AF AFG 1980 sp m 5564 NA</span></span> <span id="cb9-16"><a href="#cb9-16"></a><span class="co">#> 7 Afghanistan AF AFG 1980 sp m 65 NA</span></span> <span id="cb9-17"><a href="#cb9-17"></a><span class="co">#> 8 Afghanistan AF AFG 1980 sp f 014 NA</span></span> <span id="cb9-18"><a href="#cb9-18"></a><span class="co">#> 9 Afghanistan AF AFG 1980 sp f 1524 NA</span></span> <span id="cb9-19"><a href="#cb9-19"></a><span class="co">#> 10 Afghanistan AF AFG 1980 sp f 2534 NA</span></span> <span id="cb9-20"><a href="#cb9-20"></a><span class="co">#> # … with 405,430 more rows</span></span></code></pre></div> <p>We could go one step further use readr functions to convert the gender and age to factors. I think this is good practice when you have categorical variables with a known set of values.</p> <div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1"></a>who <span class="op">%>%</span><span class="st"> </span><span class="kw">pivot_longer</span>(</span> <span id="cb10-2"><a href="#cb10-2"></a> <span class="dt">cols =</span> new_sp_m014<span class="op">:</span>newrel_f65,</span> <span id="cb10-3"><a href="#cb10-3"></a> <span class="dt">names_to =</span> <span class="kw">c</span>(<span class="st">"diagnosis"</span>, <span class="st">"gender"</span>, <span class="st">"age"</span>), </span> <span id="cb10-4"><a href="#cb10-4"></a> <span class="dt">names_pattern =</span> <span class="st">"new_?(.*)_(.)(.*)"</span>,</span> <span id="cb10-5"><a href="#cb10-5"></a> <span class="dt">names_transform =</span> <span class="kw">list</span>(</span> <span id="cb10-6"><a href="#cb10-6"></a> <span class="dt">gender =</span> <span class="op">~</span><span class="st"> </span>readr<span class="op">::</span><span class="kw">parse_factor</span>(.x, <span class="dt">levels =</span> <span class="kw">c</span>(<span class="st">"f"</span>, <span class="st">"m"</span>)),</span> <span id="cb10-7"><a href="#cb10-7"></a> <span class="dt">age =</span> <span class="op">~</span><span class="st"> </span>readr<span class="op">::</span><span class="kw">parse_factor</span>(</span> <span id="cb10-8"><a href="#cb10-8"></a> .x,</span> <span id="cb10-9"><a href="#cb10-9"></a> <span class="dt">levels =</span> <span class="kw">c</span>(<span class="st">"014"</span>, <span class="st">"1524"</span>, <span class="st">"2534"</span>, <span class="st">"3544"</span>, <span class="st">"4554"</span>, <span class="st">"5564"</span>, <span class="st">"65"</span>), </span> <span id="cb10-10"><a href="#cb10-10"></a> <span class="dt">ordered =</span> <span class="ot">TRUE</span></span> <span id="cb10-11"><a href="#cb10-11"></a> )</span> <span id="cb10-12"><a href="#cb10-12"></a> ),</span> <span id="cb10-13"><a href="#cb10-13"></a> <span class="dt">values_to =</span> <span class="st">"count"</span>,</span> <span id="cb10-14"><a href="#cb10-14"></a>)</span></code></pre></div> </div> <div id="multiple-observations-per-row" class="section level2"> <h2>Multiple observations per row</h2> <p>So far, we have been working with data frames that have one observation per row, but many important pivotting problems involve multiple observations per row. You can usually recognise this case because name of the column that you want to appear in the output is part of the column name in the input. In this section, you’ll learn how to pivot this sort of data.</p> <p>The following example is adapted from the <a href="https://CRAN.R-project.org/package=data.table/vignettes/datatable-reshape.html">data.table vignette</a>, as inspiration for tidyr’s solution to this problem.</p> <div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1"></a>family <-<span class="st"> </span><span class="kw">tribble</span>(</span> <span id="cb11-2"><a href="#cb11-2"></a> <span class="op">~</span>family, <span class="op">~</span>dob_child1, <span class="op">~</span>dob_child2, <span class="op">~</span>gender_child1, <span class="op">~</span>gender_child2,</span> <span id="cb11-3"><a href="#cb11-3"></a> 1L, <span class="st">"1998-11-26"</span>, <span class="st">"2000-01-29"</span>, 1L, 2L,</span> <span id="cb11-4"><a href="#cb11-4"></a> 2L, <span class="st">"1996-06-22"</span>, <span class="ot">NA</span>, 2L, <span class="ot">NA</span>,</span> <span id="cb11-5"><a href="#cb11-5"></a> 3L, <span class="st">"2002-07-11"</span>, <span class="st">"2004-04-05"</span>, 2L, 2L,</span> <span id="cb11-6"><a href="#cb11-6"></a> 4L, <span class="st">"2004-10-10"</span>, <span class="st">"2009-08-27"</span>, 1L, 1L,</span> <span id="cb11-7"><a href="#cb11-7"></a> 5L, <span class="st">"2000-12-05"</span>, <span class="st">"2005-02-28"</span>, 2L, 1L,</span> <span id="cb11-8"><a href="#cb11-8"></a>)</span> <span id="cb11-9"><a href="#cb11-9"></a>family <-<span class="st"> </span>family <span class="op">%>%</span><span class="st"> </span><span class="kw">mutate_at</span>(<span class="kw">vars</span>(<span class="kw">starts_with</span>(<span class="st">"dob"</span>)), parse_date)</span> <span id="cb11-10"><a href="#cb11-10"></a>family</span> <span id="cb11-11"><a href="#cb11-11"></a><span class="co">#> # A tibble: 5 x 5</span></span> <span id="cb11-12"><a href="#cb11-12"></a><span class="co">#> family dob_child1 dob_child2 gender_child1 gender_child2</span></span> <span id="cb11-13"><a href="#cb11-13"></a><span class="co">#> <int> <date> <date> <int> <int></span></span> <span id="cb11-14"><a href="#cb11-14"></a><span class="co">#> 1 1 1998-11-26 2000-01-29 1 2</span></span> <span id="cb11-15"><a href="#cb11-15"></a><span class="co">#> 2 2 1996-06-22 NA 2 NA</span></span> <span id="cb11-16"><a href="#cb11-16"></a><span class="co">#> 3 3 2002-07-11 2004-04-05 2 2</span></span> <span id="cb11-17"><a href="#cb11-17"></a><span class="co">#> 4 4 2004-10-10 2009-08-27 1 1</span></span> <span id="cb11-18"><a href="#cb11-18"></a><span class="co">#> 5 5 2000-12-05 2005-02-28 2 1</span></span></code></pre></div> <p>Note that we have two pieces of information (or values) for each child: their <code>gender</code> and their <code>dob</code> (date of birth). These need to go into separate columns in the result. Again we supply multiple variables to <code>names_to</code>, using <code>names_sep</code> to split up each variable name. Note the special name <code>.value</code>: this tells <code>pivot_longer()</code> that that part of the column name specifies the “value” being measured (which will become a variable in the output).</p> <div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1"></a>family <span class="op">%>%</span><span class="st"> </span></span> <span id="cb12-2"><a href="#cb12-2"></a><span class="st"> </span><span class="kw">pivot_longer</span>(</span> <span id="cb12-3"><a href="#cb12-3"></a> <span class="op">!</span>family, </span> <span id="cb12-4"><a href="#cb12-4"></a> <span class="dt">names_to =</span> <span class="kw">c</span>(<span class="st">".value"</span>, <span class="st">"child"</span>), </span> <span id="cb12-5"><a href="#cb12-5"></a> <span class="dt">names_sep =</span> <span class="st">"_"</span>, </span> <span id="cb12-6"><a href="#cb12-6"></a> <span class="dt">values_drop_na =</span> <span class="ot">TRUE</span></span> <span id="cb12-7"><a href="#cb12-7"></a> )</span> <span id="cb12-8"><a href="#cb12-8"></a><span class="co">#> # A tibble: 9 x 4</span></span> <span id="cb12-9"><a href="#cb12-9"></a><span class="co">#> family child dob gender</span></span> <span id="cb12-10"><a href="#cb12-10"></a><span class="co">#> <int> <chr> <date> <int></span></span> <span id="cb12-11"><a href="#cb12-11"></a><span class="co">#> 1 1 child1 1998-11-26 1</span></span> <span id="cb12-12"><a href="#cb12-12"></a><span class="co">#> 2 1 child2 2000-01-29 2</span></span> <span id="cb12-13"><a href="#cb12-13"></a><span class="co">#> 3 2 child1 1996-06-22 2</span></span> <span id="cb12-14"><a href="#cb12-14"></a><span class="co">#> 4 3 child1 2002-07-11 2</span></span> <span id="cb12-15"><a href="#cb12-15"></a><span class="co">#> 5 3 child2 2004-04-05 2</span></span> <span id="cb12-16"><a href="#cb12-16"></a><span class="co">#> 6 4 child1 2004-10-10 1</span></span> <span id="cb12-17"><a href="#cb12-17"></a><span class="co">#> 7 4 child2 2009-08-27 1</span></span> <span id="cb12-18"><a href="#cb12-18"></a><span class="co">#> 8 5 child1 2000-12-05 2</span></span> <span id="cb12-19"><a href="#cb12-19"></a><span class="co">#> 9 5 child2 2005-02-28 1</span></span></code></pre></div> <p>Note the use of <code>values_drop_na = TRUE</code>: the input shape forces the creation of explicit missing variables for observations that don’t exist.</p> <p>This problem also exists in the <code>anscombe</code> dataset built in to base R:</p> <div class="sourceCode" id="cb13"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1"></a>anscombe</span> <span id="cb13-2"><a href="#cb13-2"></a><span class="co">#> x1 x2 x3 x4 y1 y2 y3 y4</span></span> <span id="cb13-3"><a href="#cb13-3"></a><span class="co">#> 1 10 10 10 8 8.04 9.14 7.46 6.58</span></span> <span id="cb13-4"><a href="#cb13-4"></a><span class="co">#> 2 8 8 8 8 6.95 8.14 6.77 5.76</span></span> <span id="cb13-5"><a href="#cb13-5"></a><span class="co">#> 3 13 13 13 8 7.58 8.74 12.74 7.71</span></span> <span id="cb13-6"><a href="#cb13-6"></a><span class="co">#> 4 9 9 9 8 8.81 8.77 7.11 8.84</span></span> <span id="cb13-7"><a href="#cb13-7"></a><span class="co">#> 5 11 11 11 8 8.33 9.26 7.81 8.47</span></span> <span id="cb13-8"><a href="#cb13-8"></a><span class="co">#> 6 14 14 14 8 9.96 8.10 8.84 7.04</span></span> <span id="cb13-9"><a href="#cb13-9"></a><span class="co">#> 7 6 6 6 8 7.24 6.13 6.08 5.25</span></span> <span id="cb13-10"><a href="#cb13-10"></a><span class="co">#> 8 4 4 4 19 4.26 3.10 5.39 12.50</span></span> <span id="cb13-11"><a href="#cb13-11"></a><span class="co">#> 9 12 12 12 8 10.84 9.13 8.15 5.56</span></span> <span id="cb13-12"><a href="#cb13-12"></a><span class="co">#> 10 7 7 7 8 4.82 7.26 6.42 7.91</span></span> <span id="cb13-13"><a href="#cb13-13"></a><span class="co">#> 11 5 5 5 8 5.68 4.74 5.73 6.89</span></span></code></pre></div> <p>This dataset contains four pairs of variables (<code>x1</code> and <code>y1</code>, <code>x2</code> and <code>y2</code>, etc) that underlie Anscombe’s quartet, a collection of four datasets that have the same summary statistics (mean, sd, correlation etc), but have quite different data. We want to produce a dataset with columns <code>set</code>, <code>x</code> and <code>y</code>.</p> <div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb14-1"><a href="#cb14-1"></a>anscombe <span class="op">%>%</span><span class="st"> </span></span> <span id="cb14-2"><a href="#cb14-2"></a><span class="st"> </span><span class="kw">pivot_longer</span>(<span class="kw">everything</span>(), </span> <span id="cb14-3"><a href="#cb14-3"></a> <span class="dt">names_to =</span> <span class="kw">c</span>(<span class="st">".value"</span>, <span class="st">"set"</span>), </span> <span id="cb14-4"><a href="#cb14-4"></a> <span class="dt">names_pattern =</span> <span class="st">"(.)(.)"</span></span> <span id="cb14-5"><a href="#cb14-5"></a> ) <span class="op">%>%</span><span class="st"> </span></span> <span id="cb14-6"><a href="#cb14-6"></a><span class="st"> </span><span class="kw">arrange</span>(set)</span> <span id="cb14-7"><a href="#cb14-7"></a><span class="co">#> # A tibble: 44 x 3</span></span> <span id="cb14-8"><a href="#cb14-8"></a><span class="co">#> set x y</span></span> <span id="cb14-9"><a href="#cb14-9"></a><span class="co">#> <chr> <dbl> <dbl></span></span> <span id="cb14-10"><a href="#cb14-10"></a><span class="co">#> 1 1 10 8.04</span></span> <span id="cb14-11"><a href="#cb14-11"></a><span class="co">#> 2 1 8 6.95</span></span> <span id="cb14-12"><a href="#cb14-12"></a><span class="co">#> 3 1 13 7.58</span></span> <span id="cb14-13"><a href="#cb14-13"></a><span class="co">#> 4 1 9 8.81</span></span> <span id="cb14-14"><a href="#cb14-14"></a><span class="co">#> 5 1 11 8.33</span></span> <span id="cb14-15"><a href="#cb14-15"></a><span class="co">#> 6 1 14 9.96</span></span> <span id="cb14-16"><a href="#cb14-16"></a><span class="co">#> 7 1 6 7.24</span></span> <span id="cb14-17"><a href="#cb14-17"></a><span class="co">#> 8 1 4 4.26</span></span> <span id="cb14-18"><a href="#cb14-18"></a><span class="co">#> 9 1 12 10.8 </span></span> <span id="cb14-19"><a href="#cb14-19"></a><span class="co">#> 10 1 7 4.82</span></span> <span id="cb14-20"><a href="#cb14-20"></a><span class="co">#> # … with 34 more rows</span></span></code></pre></div> <p>A similar situation can arise with panel data. For example, take this example dataset provided by <a href="https://github.com/leeper/rio/issues/193">Thomas Leeper</a>. We can tidy it using the same approach as for <code>anscombe</code>:</p> <div class="sourceCode" id="cb15"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb15-1"><a href="#cb15-1"></a>pnl <-<span class="st"> </span><span class="kw">tibble</span>(</span> <span id="cb15-2"><a href="#cb15-2"></a> <span class="dt">x =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">4</span>,</span> <span id="cb15-3"><a href="#cb15-3"></a> <span class="dt">a =</span> <span class="kw">c</span>(<span class="dv">1</span>, <span class="dv">1</span>,<span class="dv">0</span>, <span class="dv">0</span>),</span> <span id="cb15-4"><a href="#cb15-4"></a> <span class="dt">b =</span> <span class="kw">c</span>(<span class="dv">0</span>, <span class="dv">1</span>, <span class="dv">1</span>, <span class="dv">1</span>),</span> <span id="cb15-5"><a href="#cb15-5"></a> <span class="dt">y1 =</span> <span class="kw">rnorm</span>(<span class="dv">4</span>),</span> <span id="cb15-6"><a href="#cb15-6"></a> <span class="dt">y2 =</span> <span class="kw">rnorm</span>(<span class="dv">4</span>),</span> <span id="cb15-7"><a href="#cb15-7"></a> <span class="dt">z1 =</span> <span class="kw">rep</span>(<span class="dv">3</span>, <span class="dv">4</span>),</span> <span id="cb15-8"><a href="#cb15-8"></a> <span class="dt">z2 =</span> <span class="kw">rep</span>(<span class="op">-</span><span class="dv">2</span>, <span class="dv">4</span>),</span> <span id="cb15-9"><a href="#cb15-9"></a>)</span> <span id="cb15-10"><a href="#cb15-10"></a></span> <span id="cb15-11"><a href="#cb15-11"></a>pnl <span class="op">%>%</span><span class="st"> </span></span> <span id="cb15-12"><a href="#cb15-12"></a><span class="st"> </span><span class="kw">pivot_longer</span>(</span> <span id="cb15-13"><a href="#cb15-13"></a> <span class="op">!</span><span class="kw">c</span>(x, a, b), </span> <span id="cb15-14"><a href="#cb15-14"></a> <span class="dt">names_to =</span> <span class="kw">c</span>(<span class="st">".value"</span>, <span class="st">"time"</span>), </span> <span id="cb15-15"><a href="#cb15-15"></a> <span class="dt">names_pattern =</span> <span class="st">"(.)(.)"</span></span> <span id="cb15-16"><a href="#cb15-16"></a> )</span> <span id="cb15-17"><a href="#cb15-17"></a><span class="co">#> # A tibble: 8 x 6</span></span> <span id="cb15-18"><a href="#cb15-18"></a><span class="co">#> x a b time y z</span></span> <span id="cb15-19"><a href="#cb15-19"></a><span class="co">#> <int> <dbl> <dbl> <chr> <dbl> <dbl></span></span> <span id="cb15-20"><a href="#cb15-20"></a><span class="co">#> 1 1 1 0 1 -2.14 3</span></span> <span id="cb15-21"><a href="#cb15-21"></a><span class="co">#> 2 1 1 0 2 0.0877 -2</span></span> <span id="cb15-22"><a href="#cb15-22"></a><span class="co">#> 3 2 1 1 1 -0.609 3</span></span> <span id="cb15-23"><a href="#cb15-23"></a><span class="co">#> 4 2 1 1 2 -0.593 -2</span></span> <span id="cb15-24"><a href="#cb15-24"></a><span class="co">#> 5 3 0 1 1 0.354 3</span></span> <span id="cb15-25"><a href="#cb15-25"></a><span class="co">#> 6 3 0 1 2 2.43 -2</span></span> <span id="cb15-26"><a href="#cb15-26"></a><span class="co">#> 7 4 0 1 1 0.607 3</span></span> <span id="cb15-27"><a href="#cb15-27"></a><span class="co">#> 8 4 0 1 2 -1.02 -2</span></span></code></pre></div> </div> <div id="duplicated-column-names" class="section level2"> <h2>Duplicated column names</h2> <p>Occassionally you will come across datasets that have duplicated column names. Generally, such datasets are hard to work with in R, because when you refer to a column by name it only finds the first match. To create a tibble with duplicated names, you have to explicitly opt out of the name repair that usually prevents you from creating such a dataset:</p> <div class="sourceCode" id="cb16"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1"></a>df <-<span class="st"> </span><span class="kw">tibble</span>(<span class="dt">id =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">3</span>, <span class="dt">y =</span> <span class="dv">4</span><span class="op">:</span><span class="dv">6</span>, <span class="dt">y =</span> <span class="dv">5</span><span class="op">:</span><span class="dv">7</span>, <span class="dt">y =</span> <span class="dv">7</span><span class="op">:</span><span class="dv">9</span>, <span class="dt">.name_repair =</span> <span class="st">"minimal"</span>)</span> <span id="cb16-2"><a href="#cb16-2"></a>df</span> <span id="cb16-3"><a href="#cb16-3"></a><span class="co">#> # A tibble: 3 x 4</span></span> <span id="cb16-4"><a href="#cb16-4"></a><span class="co">#> id y y y</span></span> <span id="cb16-5"><a href="#cb16-5"></a><span class="co">#> <int> <int> <int> <int></span></span> <span id="cb16-6"><a href="#cb16-6"></a><span class="co">#> 1 1 4 5 7</span></span> <span id="cb16-7"><a href="#cb16-7"></a><span class="co">#> 2 2 5 6 8</span></span> <span id="cb16-8"><a href="#cb16-8"></a><span class="co">#> 3 3 6 7 9</span></span></code></pre></div> <p>When <code>pivot_longer()</code> encounters such data, it automatically adds another column to the output:</p> <div class="sourceCode" id="cb17"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb17-1"><a href="#cb17-1"></a>df <span class="op">%>%</span><span class="st"> </span><span class="kw">pivot_longer</span>(<span class="op">!</span>id, <span class="dt">names_to =</span> <span class="st">"name"</span>, <span class="dt">values_to =</span> <span class="st">"value"</span>)</span> <span id="cb17-2"><a href="#cb17-2"></a><span class="co">#> # A tibble: 9 x 3</span></span> <span id="cb17-3"><a href="#cb17-3"></a><span class="co">#> id name value</span></span> <span id="cb17-4"><a href="#cb17-4"></a><span class="co">#> <int> <chr> <int></span></span> <span id="cb17-5"><a href="#cb17-5"></a><span class="co">#> 1 1 y 4</span></span> <span id="cb17-6"><a href="#cb17-6"></a><span class="co">#> 2 1 y 5</span></span> <span id="cb17-7"><a href="#cb17-7"></a><span class="co">#> 3 1 y 7</span></span> <span id="cb17-8"><a href="#cb17-8"></a><span class="co">#> 4 2 y 5</span></span> <span id="cb17-9"><a href="#cb17-9"></a><span class="co">#> 5 2 y 6</span></span> <span id="cb17-10"><a href="#cb17-10"></a><span class="co">#> 6 2 y 8</span></span> <span id="cb17-11"><a href="#cb17-11"></a><span class="co">#> 7 3 y 6</span></span> <span id="cb17-12"><a href="#cb17-12"></a><span class="co">#> 8 3 y 7</span></span> <span id="cb17-13"><a href="#cb17-13"></a><span class="co">#> 9 3 y 9</span></span></code></pre></div> <p>A similar process is applied when multiple input columns are mapped to the same output column, as in the following example where we ignore the numeric suffix on each column name:</p> <div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1"></a>df <-<span class="st"> </span><span class="kw">tibble</span>(<span class="dt">id =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">3</span>, <span class="dt">x1 =</span> <span class="dv">4</span><span class="op">:</span><span class="dv">6</span>, <span class="dt">x2 =</span> <span class="dv">5</span><span class="op">:</span><span class="dv">7</span>, <span class="dt">y1 =</span> <span class="dv">7</span><span class="op">:</span><span class="dv">9</span>, <span class="dt">y2 =</span> <span class="dv">10</span><span class="op">:</span><span class="dv">12</span>)</span> <span id="cb18-2"><a href="#cb18-2"></a>df <span class="op">%>%</span><span class="st"> </span><span class="kw">pivot_longer</span>(<span class="op">!</span>id, <span class="dt">names_to =</span> <span class="st">".value"</span>, <span class="dt">names_pattern =</span> <span class="st">"(.)."</span>)</span> <span id="cb18-3"><a href="#cb18-3"></a><span class="co">#> # A tibble: 6 x 3</span></span> <span id="cb18-4"><a href="#cb18-4"></a><span class="co">#> id x y</span></span> <span id="cb18-5"><a href="#cb18-5"></a><span class="co">#> <int> <int> <int></span></span> <span id="cb18-6"><a href="#cb18-6"></a><span class="co">#> 1 1 4 7</span></span> <span id="cb18-7"><a href="#cb18-7"></a><span class="co">#> 2 1 5 10</span></span> <span id="cb18-8"><a href="#cb18-8"></a><span class="co">#> 3 2 5 8</span></span> <span id="cb18-9"><a href="#cb18-9"></a><span class="co">#> 4 2 6 11</span></span> <span id="cb18-10"><a href="#cb18-10"></a><span class="co">#> 5 3 6 9</span></span> <span id="cb18-11"><a href="#cb18-11"></a><span class="co">#> 6 3 7 12</span></span></code></pre></div> </div> </div> <div id="wider" class="section level1"> <h1>Wider</h1> <p><code>pivot_wider()</code> is the opposite of <code>pivot_longer()</code>: it makes a dataset <strong>wider</strong> by increasing the number of columns and decreasing the number of rows. It’s relatively rare to need <code>pivot_wider()</code> to make tidy data, but it’s often useful for creating summary tables for presentation, or data in a format needed by other tools.</p> <div id="capture-recapture-data" class="section level2"> <h2>Capture-recapture data</h2> <p>The <code>fish_encounters</code> dataset, contributed by <a href="https://fishsciences.github.io/post/visualizing-fish-encounter-histories/">Myfanwy Johnston</a>, describes when fish swimming down a river are detected by automatic monitoring stations:</p> <div class="sourceCode" id="cb19"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb19-1"><a href="#cb19-1"></a>fish_encounters</span> <span id="cb19-2"><a href="#cb19-2"></a><span class="co">#> # A tibble: 114 x 3</span></span> <span id="cb19-3"><a href="#cb19-3"></a><span class="co">#> fish station seen</span></span> <span id="cb19-4"><a href="#cb19-4"></a><span class="co">#> <fct> <fct> <int></span></span> <span id="cb19-5"><a href="#cb19-5"></a><span class="co">#> 1 4842 Release 1</span></span> <span id="cb19-6"><a href="#cb19-6"></a><span class="co">#> 2 4842 I80_1 1</span></span> <span id="cb19-7"><a href="#cb19-7"></a><span class="co">#> 3 4842 Lisbon 1</span></span> <span id="cb19-8"><a href="#cb19-8"></a><span class="co">#> 4 4842 Rstr 1</span></span> <span id="cb19-9"><a href="#cb19-9"></a><span class="co">#> 5 4842 Base_TD 1</span></span> <span id="cb19-10"><a href="#cb19-10"></a><span class="co">#> 6 4842 BCE 1</span></span> <span id="cb19-11"><a href="#cb19-11"></a><span class="co">#> 7 4842 BCW 1</span></span> <span id="cb19-12"><a href="#cb19-12"></a><span class="co">#> 8 4842 BCE2 1</span></span> <span id="cb19-13"><a href="#cb19-13"></a><span class="co">#> 9 4842 BCW2 1</span></span> <span id="cb19-14"><a href="#cb19-14"></a><span class="co">#> 10 4842 MAE 1</span></span> <span id="cb19-15"><a href="#cb19-15"></a><span class="co">#> # … with 104 more rows</span></span></code></pre></div> <p>Many tools used to analyse this data need it in a form where each station is a column:</p> <div class="sourceCode" id="cb20"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1"></a>fish_encounters <span class="op">%>%</span><span class="st"> </span><span class="kw">pivot_wider</span>(<span class="dt">names_from =</span> station, <span class="dt">values_from =</span> seen)</span> <span id="cb20-2"><a href="#cb20-2"></a><span class="co">#> # A tibble: 19 x 12</span></span> <span id="cb20-3"><a href="#cb20-3"></a><span class="co">#> fish Release I80_1 Lisbon Rstr Base_TD BCE BCW BCE2 BCW2 MAE MAW</span></span> <span id="cb20-4"><a href="#cb20-4"></a><span class="co">#> <fct> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int></span></span> <span id="cb20-5"><a href="#cb20-5"></a><span class="co">#> 1 4842 1 1 1 1 1 1 1 1 1 1 1</span></span> <span id="cb20-6"><a href="#cb20-6"></a><span class="co">#> 2 4843 1 1 1 1 1 1 1 1 1 1 1</span></span> <span id="cb20-7"><a href="#cb20-7"></a><span class="co">#> 3 4844 1 1 1 1 1 1 1 1 1 1 1</span></span> <span id="cb20-8"><a href="#cb20-8"></a><span class="co">#> 4 4845 1 1 1 1 1 NA NA NA NA NA NA</span></span> <span id="cb20-9"><a href="#cb20-9"></a><span class="co">#> 5 4847 1 1 1 NA NA NA NA NA NA NA NA</span></span> <span id="cb20-10"><a href="#cb20-10"></a><span class="co">#> 6 4848 1 1 1 1 NA NA NA NA NA NA NA</span></span> <span id="cb20-11"><a href="#cb20-11"></a><span class="co">#> 7 4849 1 1 NA NA NA NA NA NA NA NA NA</span></span> <span id="cb20-12"><a href="#cb20-12"></a><span class="co">#> 8 4850 1 1 NA 1 1 1 1 NA NA NA NA</span></span> <span id="cb20-13"><a href="#cb20-13"></a><span class="co">#> 9 4851 1 1 NA NA NA NA NA NA NA NA NA</span></span> <span id="cb20-14"><a href="#cb20-14"></a><span class="co">#> 10 4854 1 1 NA NA NA NA NA NA NA NA NA</span></span> <span id="cb20-15"><a href="#cb20-15"></a><span class="co">#> # … with 9 more rows</span></span></code></pre></div> <p>This dataset only records when a fish was detected by the station - it doesn’t record when it wasn’t detected (this is common with this type of data). That means the output data is filled with <code>NA</code>s. However, in this case we know that the absence of a record means that the fish was not <code>seen</code>, so we can ask <code>pivot_wider()</code> to fill these missing values in with zeros:</p> <div class="sourceCode" id="cb21"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb21-1"><a href="#cb21-1"></a>fish_encounters <span class="op">%>%</span><span class="st"> </span><span class="kw">pivot_wider</span>(</span> <span id="cb21-2"><a href="#cb21-2"></a> <span class="dt">names_from =</span> station, </span> <span id="cb21-3"><a href="#cb21-3"></a> <span class="dt">values_from =</span> seen,</span> <span id="cb21-4"><a href="#cb21-4"></a> <span class="dt">values_fill =</span> <span class="dv">0</span></span> <span id="cb21-5"><a href="#cb21-5"></a>)</span> <span id="cb21-6"><a href="#cb21-6"></a><span class="co">#> # A tibble: 19 x 12</span></span> <span id="cb21-7"><a href="#cb21-7"></a><span class="co">#> fish Release I80_1 Lisbon Rstr Base_TD BCE BCW BCE2 BCW2 MAE MAW</span></span> <span id="cb21-8"><a href="#cb21-8"></a><span class="co">#> <fct> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int></span></span> <span id="cb21-9"><a href="#cb21-9"></a><span class="co">#> 1 4842 1 1 1 1 1 1 1 1 1 1 1</span></span> <span id="cb21-10"><a href="#cb21-10"></a><span class="co">#> 2 4843 1 1 1 1 1 1 1 1 1 1 1</span></span> <span id="cb21-11"><a href="#cb21-11"></a><span class="co">#> 3 4844 1 1 1 1 1 1 1 1 1 1 1</span></span> <span id="cb21-12"><a href="#cb21-12"></a><span class="co">#> 4 4845 1 1 1 1 1 0 0 0 0 0 0</span></span> <span id="cb21-13"><a href="#cb21-13"></a><span class="co">#> 5 4847 1 1 1 0 0 0 0 0 0 0 0</span></span> <span id="cb21-14"><a href="#cb21-14"></a><span class="co">#> 6 4848 1 1 1 1 0 0 0 0 0 0 0</span></span> <span id="cb21-15"><a href="#cb21-15"></a><span class="co">#> 7 4849 1 1 0 0 0 0 0 0 0 0 0</span></span> <span id="cb21-16"><a href="#cb21-16"></a><span class="co">#> 8 4850 1 1 0 1 1 1 1 0 0 0 0</span></span> <span id="cb21-17"><a href="#cb21-17"></a><span class="co">#> 9 4851 1 1 0 0 0 0 0 0 0 0 0</span></span> <span id="cb21-18"><a href="#cb21-18"></a><span class="co">#> 10 4854 1 1 0 0 0 0 0 0 0 0 0</span></span> <span id="cb21-19"><a href="#cb21-19"></a><span class="co">#> # … with 9 more rows</span></span></code></pre></div> </div> <div id="aggregation" class="section level2"> <h2>Aggregation</h2> <p>You can also use <code>pivot_wider()</code> to perform simple aggregation. For example, take the <code>warpbreaks</code> dataset built in to base R (converted to a tibble for the better print method):</p> <div class="sourceCode" id="cb22"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb22-1"><a href="#cb22-1"></a>warpbreaks <-<span class="st"> </span>warpbreaks <span class="op">%>%</span><span class="st"> </span><span class="kw">as_tibble</span>() <span class="op">%>%</span><span class="st"> </span><span class="kw">select</span>(wool, tension, breaks)</span> <span id="cb22-2"><a href="#cb22-2"></a>warpbreaks</span> <span id="cb22-3"><a href="#cb22-3"></a><span class="co">#> # A tibble: 54 x 3</span></span> <span id="cb22-4"><a href="#cb22-4"></a><span class="co">#> wool tension breaks</span></span> <span id="cb22-5"><a href="#cb22-5"></a><span class="co">#> <fct> <fct> <dbl></span></span> <span id="cb22-6"><a href="#cb22-6"></a><span class="co">#> 1 A L 26</span></span> <span id="cb22-7"><a href="#cb22-7"></a><span class="co">#> 2 A L 30</span></span> <span id="cb22-8"><a href="#cb22-8"></a><span class="co">#> 3 A L 54</span></span> <span id="cb22-9"><a href="#cb22-9"></a><span class="co">#> 4 A L 25</span></span> <span id="cb22-10"><a href="#cb22-10"></a><span class="co">#> 5 A L 70</span></span> <span id="cb22-11"><a href="#cb22-11"></a><span class="co">#> 6 A L 52</span></span> <span id="cb22-12"><a href="#cb22-12"></a><span class="co">#> 7 A L 51</span></span> <span id="cb22-13"><a href="#cb22-13"></a><span class="co">#> 8 A L 26</span></span> <span id="cb22-14"><a href="#cb22-14"></a><span class="co">#> 9 A L 67</span></span> <span id="cb22-15"><a href="#cb22-15"></a><span class="co">#> 10 A M 18</span></span> <span id="cb22-16"><a href="#cb22-16"></a><span class="co">#> # … with 44 more rows</span></span></code></pre></div> <p>This is a designed experiment with nine replicates for every combination of <code>wool</code> (<code>A</code> and <code>B</code>) and <code>tension</code> (<code>L</code>, <code>M</code>, <code>H</code>):</p> <div class="sourceCode" id="cb23"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb23-1"><a href="#cb23-1"></a>warpbreaks <span class="op">%>%</span><span class="st"> </span><span class="kw">count</span>(wool, tension)</span> <span id="cb23-2"><a href="#cb23-2"></a><span class="co">#> # A tibble: 6 x 3</span></span> <span id="cb23-3"><a href="#cb23-3"></a><span class="co">#> wool tension n</span></span> <span id="cb23-4"><a href="#cb23-4"></a><span class="co">#> <fct> <fct> <int></span></span> <span id="cb23-5"><a href="#cb23-5"></a><span class="co">#> 1 A L 9</span></span> <span id="cb23-6"><a href="#cb23-6"></a><span class="co">#> 2 A M 9</span></span> <span id="cb23-7"><a href="#cb23-7"></a><span class="co">#> 3 A H 9</span></span> <span id="cb23-8"><a href="#cb23-8"></a><span class="co">#> 4 B L 9</span></span> <span id="cb23-9"><a href="#cb23-9"></a><span class="co">#> 5 B M 9</span></span> <span id="cb23-10"><a href="#cb23-10"></a><span class="co">#> 6 B H 9</span></span></code></pre></div> <p>What happens if we attempt to pivot the levels of <code>wool</code> into the columns?</p> <div class="sourceCode" id="cb24"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb24-1"><a href="#cb24-1"></a>warpbreaks <span class="op">%>%</span><span class="st"> </span><span class="kw">pivot_wider</span>(<span class="dt">names_from =</span> wool, <span class="dt">values_from =</span> breaks)</span> <span id="cb24-2"><a href="#cb24-2"></a><span class="co">#> Warning: Values are not uniquely identified; output will contain list-cols.</span></span> <span id="cb24-3"><a href="#cb24-3"></a><span class="co">#> * Use `values_fn = list` to suppress this warning.</span></span> <span id="cb24-4"><a href="#cb24-4"></a><span class="co">#> * Use `values_fn = length` to identify where the duplicates arise</span></span> <span id="cb24-5"><a href="#cb24-5"></a><span class="co">#> * Use `values_fn = {summary_fun}` to summarise duplicates</span></span> <span id="cb24-6"><a href="#cb24-6"></a><span class="co">#> # A tibble: 3 x 3</span></span> <span id="cb24-7"><a href="#cb24-7"></a><span class="co">#> tension A B </span></span> <span id="cb24-8"><a href="#cb24-8"></a><span class="co">#> <fct> <list> <list> </span></span> <span id="cb24-9"><a href="#cb24-9"></a><span class="co">#> 1 L <dbl [9]> <dbl [9]></span></span> <span id="cb24-10"><a href="#cb24-10"></a><span class="co">#> 2 M <dbl [9]> <dbl [9]></span></span> <span id="cb24-11"><a href="#cb24-11"></a><span class="co">#> 3 H <dbl [9]> <dbl [9]></span></span></code></pre></div> <p>We get a warning that each cell in the output corresponds to multiple cells in the input. The default behaviour produces list-columns, which contain all the individual values. A more useful output would be summary statistics, e.g. <code>mean</code> breaks for each combination of wool and tension:</p> <div class="sourceCode" id="cb25"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb25-1"><a href="#cb25-1"></a>warpbreaks <span class="op">%>%</span><span class="st"> </span></span> <span id="cb25-2"><a href="#cb25-2"></a><span class="st"> </span><span class="kw">pivot_wider</span>(</span> <span id="cb25-3"><a href="#cb25-3"></a> <span class="dt">names_from =</span> wool, </span> <span id="cb25-4"><a href="#cb25-4"></a> <span class="dt">values_from =</span> breaks,</span> <span id="cb25-5"><a href="#cb25-5"></a> <span class="dt">values_fn =</span> <span class="kw">list</span>(<span class="dt">breaks =</span> mean)</span> <span id="cb25-6"><a href="#cb25-6"></a> )</span> <span id="cb25-7"><a href="#cb25-7"></a><span class="co">#> # A tibble: 3 x 3</span></span> <span id="cb25-8"><a href="#cb25-8"></a><span class="co">#> tension A B</span></span> <span id="cb25-9"><a href="#cb25-9"></a><span class="co">#> <fct> <dbl> <dbl></span></span> <span id="cb25-10"><a href="#cb25-10"></a><span class="co">#> 1 L 44.6 28.2</span></span> <span id="cb25-11"><a href="#cb25-11"></a><span class="co">#> 2 M 24 28.8</span></span> <span id="cb25-12"><a href="#cb25-12"></a><span class="co">#> 3 H 24.6 18.8</span></span></code></pre></div> <p>For more complex summary operations, I recommend summarising before reshaping, but for simple cases it’s often convenient to summarise within <code>pivot_wider()</code>.</p> </div> <div id="generate-column-name-from-multiple-variables" class="section level2"> <h2>Generate column name from multiple variables</h2> <p>Imagine, as in <a href="https://stackoverflow.com/questions/24929954" class="uri">https://stackoverflow.com/questions/24929954</a>, that we have information containing the combination of product, country, and year. In tidy form it might look like this:</p> <div class="sourceCode" id="cb26"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb26-1"><a href="#cb26-1"></a>production <-<span class="st"> </span><span class="kw">expand_grid</span>(</span> <span id="cb26-2"><a href="#cb26-2"></a> <span class="dt">product =</span> <span class="kw">c</span>(<span class="st">"A"</span>, <span class="st">"B"</span>), </span> <span id="cb26-3"><a href="#cb26-3"></a> <span class="dt">country =</span> <span class="kw">c</span>(<span class="st">"AI"</span>, <span class="st">"EI"</span>), </span> <span id="cb26-4"><a href="#cb26-4"></a> <span class="dt">year =</span> <span class="dv">2000</span><span class="op">:</span><span class="dv">2014</span></span> <span id="cb26-5"><a href="#cb26-5"></a> ) <span class="op">%>%</span></span> <span id="cb26-6"><a href="#cb26-6"></a><span class="st"> </span><span class="kw">filter</span>((product <span class="op">==</span><span class="st"> "A"</span> <span class="op">&</span><span class="st"> </span>country <span class="op">==</span><span class="st"> "AI"</span>) <span class="op">|</span><span class="st"> </span>product <span class="op">==</span><span class="st"> "B"</span>) <span class="op">%>%</span><span class="st"> </span></span> <span id="cb26-7"><a href="#cb26-7"></a><span class="st"> </span><span class="kw">mutate</span>(<span class="dt">production =</span> <span class="kw">rnorm</span>(<span class="kw">nrow</span>(.)))</span> <span id="cb26-8"><a href="#cb26-8"></a>production</span> <span id="cb26-9"><a href="#cb26-9"></a><span class="co">#> # A tibble: 45 x 4</span></span> <span id="cb26-10"><a href="#cb26-10"></a><span class="co">#> product country year production</span></span> <span id="cb26-11"><a href="#cb26-11"></a><span class="co">#> <chr> <chr> <int> <dbl></span></span> <span id="cb26-12"><a href="#cb26-12"></a><span class="co">#> 1 A AI 2000 -0.399 </span></span> <span id="cb26-13"><a href="#cb26-13"></a><span class="co">#> 2 A AI 2001 -1.28 </span></span> <span id="cb26-14"><a href="#cb26-14"></a><span class="co">#> 3 A AI 2002 0.162 </span></span> <span id="cb26-15"><a href="#cb26-15"></a><span class="co">#> 4 A AI 2003 2.54 </span></span> <span id="cb26-16"><a href="#cb26-16"></a><span class="co">#> 5 A AI 2004 1.58 </span></span> <span id="cb26-17"><a href="#cb26-17"></a><span class="co">#> 6 A AI 2005 0.0607</span></span> <span id="cb26-18"><a href="#cb26-18"></a><span class="co">#> 7 A AI 2006 -0.827 </span></span> <span id="cb26-19"><a href="#cb26-19"></a><span class="co">#> 8 A AI 2007 -0.649 </span></span> <span id="cb26-20"><a href="#cb26-20"></a><span class="co">#> 9 A AI 2008 -0.921 </span></span> <span id="cb26-21"><a href="#cb26-21"></a><span class="co">#> 10 A AI 2009 0.823 </span></span> <span id="cb26-22"><a href="#cb26-22"></a><span class="co">#> # … with 35 more rows</span></span></code></pre></div> <p>We want to widen the data so we have one column for each combination of <code>product</code> and <code>country</code>. The key is to specify multiple variables for <code>names_from</code>:</p> <div class="sourceCode" id="cb27"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb27-1"><a href="#cb27-1"></a>production <span class="op">%>%</span><span class="st"> </span><span class="kw">pivot_wider</span>(</span> <span id="cb27-2"><a href="#cb27-2"></a> <span class="dt">names_from =</span> <span class="kw">c</span>(product, country), </span> <span id="cb27-3"><a href="#cb27-3"></a> <span class="dt">values_from =</span> production</span> <span id="cb27-4"><a href="#cb27-4"></a>)</span> <span id="cb27-5"><a href="#cb27-5"></a><span class="co">#> # A tibble: 15 x 4</span></span> <span id="cb27-6"><a href="#cb27-6"></a><span class="co">#> year A_AI B_AI B_EI</span></span> <span id="cb27-7"><a href="#cb27-7"></a><span class="co">#> <int> <dbl> <dbl> <dbl></span></span> <span id="cb27-8"><a href="#cb27-8"></a><span class="co">#> 1 2000 -0.399 0.410 0.300</span></span> <span id="cb27-9"><a href="#cb27-9"></a><span class="co">#> 2 2001 -1.28 0.358 0.972</span></span> <span id="cb27-10"><a href="#cb27-10"></a><span class="co">#> 3 2002 0.162 0.500 -1.27 </span></span> <span id="cb27-11"><a href="#cb27-11"></a><span class="co">#> 4 2003 2.54 0.823 0.246</span></span> <span id="cb27-12"><a href="#cb27-12"></a><span class="co">#> 5 2004 1.58 0.376 0.751</span></span> <span id="cb27-13"><a href="#cb27-13"></a><span class="co">#> 6 2005 0.0607 -1.30 1.16 </span></span> <span id="cb27-14"><a href="#cb27-14"></a><span class="co">#> 7 2006 -0.827 1.37 -0.934</span></span> <span id="cb27-15"><a href="#cb27-15"></a><span class="co">#> 8 2007 -0.649 -0.214 0.625</span></span> <span id="cb27-16"><a href="#cb27-16"></a><span class="co">#> 9 2008 -0.921 -0.694 1.09 </span></span> <span id="cb27-17"><a href="#cb27-17"></a><span class="co">#> 10 2009 0.823 0.103 -2.99 </span></span> <span id="cb27-18"><a href="#cb27-18"></a><span class="co">#> # … with 5 more rows</span></span></code></pre></div> <p>When either <code>names_from</code> or <code>values_from</code> select multiple variables, you can control how the column names in the output constructed with <code>names_sep</code> and <code>names_prefix</code>, or the workhorse <code>names_glue</code>:</p> <div class="sourceCode" id="cb28"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb28-1"><a href="#cb28-1"></a>production <span class="op">%>%</span><span class="st"> </span><span class="kw">pivot_wider</span>(</span> <span id="cb28-2"><a href="#cb28-2"></a> <span class="dt">names_from =</span> <span class="kw">c</span>(product, country), </span> <span id="cb28-3"><a href="#cb28-3"></a> <span class="dt">values_from =</span> production,</span> <span id="cb28-4"><a href="#cb28-4"></a> <span class="dt">names_sep =</span> <span class="st">"."</span>,</span> <span id="cb28-5"><a href="#cb28-5"></a> <span class="dt">names_prefix =</span> <span class="st">"prod."</span></span> <span id="cb28-6"><a href="#cb28-6"></a>)</span> <span id="cb28-7"><a href="#cb28-7"></a><span class="co">#> # A tibble: 15 x 4</span></span> <span id="cb28-8"><a href="#cb28-8"></a><span class="co">#> year prod.A.AI prod.B.AI prod.B.EI</span></span> <span id="cb28-9"><a href="#cb28-9"></a><span class="co">#> <int> <dbl> <dbl> <dbl></span></span> <span id="cb28-10"><a href="#cb28-10"></a><span class="co">#> 1 2000 -0.399 0.410 0.300</span></span> <span id="cb28-11"><a href="#cb28-11"></a><span class="co">#> 2 2001 -1.28 0.358 0.972</span></span> <span id="cb28-12"><a href="#cb28-12"></a><span class="co">#> 3 2002 0.162 0.500 -1.27 </span></span> <span id="cb28-13"><a href="#cb28-13"></a><span class="co">#> 4 2003 2.54 0.823 0.246</span></span> <span id="cb28-14"><a href="#cb28-14"></a><span class="co">#> 5 2004 1.58 0.376 0.751</span></span> <span id="cb28-15"><a href="#cb28-15"></a><span class="co">#> 6 2005 0.0607 -1.30 1.16 </span></span> <span id="cb28-16"><a href="#cb28-16"></a><span class="co">#> 7 2006 -0.827 1.37 -0.934</span></span> <span id="cb28-17"><a href="#cb28-17"></a><span class="co">#> 8 2007 -0.649 -0.214 0.625</span></span> <span id="cb28-18"><a href="#cb28-18"></a><span class="co">#> 9 2008 -0.921 -0.694 1.09 </span></span> <span id="cb28-19"><a href="#cb28-19"></a><span class="co">#> 10 2009 0.823 0.103 -2.99 </span></span> <span id="cb28-20"><a href="#cb28-20"></a><span class="co">#> # … with 5 more rows</span></span> <span id="cb28-21"><a href="#cb28-21"></a></span> <span id="cb28-22"><a href="#cb28-22"></a>production <span class="op">%>%</span><span class="st"> </span><span class="kw">pivot_wider</span>(</span> <span id="cb28-23"><a href="#cb28-23"></a> <span class="dt">names_from =</span> <span class="kw">c</span>(product, country), </span> <span id="cb28-24"><a href="#cb28-24"></a> <span class="dt">values_from =</span> production,</span> <span id="cb28-25"><a href="#cb28-25"></a> <span class="dt">names_glue =</span> <span class="st">"prod_{product}_{country}"</span></span> <span id="cb28-26"><a href="#cb28-26"></a>)</span> <span id="cb28-27"><a href="#cb28-27"></a><span class="co">#> # A tibble: 15 x 4</span></span> <span id="cb28-28"><a href="#cb28-28"></a><span class="co">#> year prod_A_AI prod_B_AI prod_B_EI</span></span> <span id="cb28-29"><a href="#cb28-29"></a><span class="co">#> <int> <dbl> <dbl> <dbl></span></span> <span id="cb28-30"><a href="#cb28-30"></a><span class="co">#> 1 2000 -0.399 0.410 0.300</span></span> <span id="cb28-31"><a href="#cb28-31"></a><span class="co">#> 2 2001 -1.28 0.358 0.972</span></span> <span id="cb28-32"><a href="#cb28-32"></a><span class="co">#> 3 2002 0.162 0.500 -1.27 </span></span> <span id="cb28-33"><a href="#cb28-33"></a><span class="co">#> 4 2003 2.54 0.823 0.246</span></span> <span id="cb28-34"><a href="#cb28-34"></a><span class="co">#> 5 2004 1.58 0.376 0.751</span></span> <span id="cb28-35"><a href="#cb28-35"></a><span class="co">#> 6 2005 0.0607 -1.30 1.16 </span></span> <span id="cb28-36"><a href="#cb28-36"></a><span class="co">#> 7 2006 -0.827 1.37 -0.934</span></span> <span id="cb28-37"><a href="#cb28-37"></a><span class="co">#> 8 2007 -0.649 -0.214 0.625</span></span> <span id="cb28-38"><a href="#cb28-38"></a><span class="co">#> 9 2008 -0.921 -0.694 1.09 </span></span> <span id="cb28-39"><a href="#cb28-39"></a><span class="co">#> 10 2009 0.823 0.103 -2.99 </span></span> <span id="cb28-40"><a href="#cb28-40"></a><span class="co">#> # … with 5 more rows</span></span></code></pre></div> </div> <div id="tidy-census" class="section level2"> <h2>Tidy census</h2> <p>The <code>us_rent_income</code> dataset contains information about median income and rent for each state in the US for 2017 (from the American Community Survey, retrieved with the <a href="https://walker-data.com/tidycensus/">tidycensus</a> package).</p> <div class="sourceCode" id="cb29"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb29-1"><a href="#cb29-1"></a>us_rent_income</span> <span id="cb29-2"><a href="#cb29-2"></a><span class="co">#> # A tibble: 104 x 5</span></span> <span id="cb29-3"><a href="#cb29-3"></a><span class="co">#> GEOID NAME variable estimate moe</span></span> <span id="cb29-4"><a href="#cb29-4"></a><span class="co">#> <chr> <chr> <chr> <dbl> <dbl></span></span> <span id="cb29-5"><a href="#cb29-5"></a><span class="co">#> 1 01 Alabama income 24476 136</span></span> <span id="cb29-6"><a href="#cb29-6"></a><span class="co">#> 2 01 Alabama rent 747 3</span></span> <span id="cb29-7"><a href="#cb29-7"></a><span class="co">#> 3 02 Alaska income 32940 508</span></span> <span id="cb29-8"><a href="#cb29-8"></a><span class="co">#> 4 02 Alaska rent 1200 13</span></span> <span id="cb29-9"><a href="#cb29-9"></a><span class="co">#> 5 04 Arizona income 27517 148</span></span> <span id="cb29-10"><a href="#cb29-10"></a><span class="co">#> 6 04 Arizona rent 972 4</span></span> <span id="cb29-11"><a href="#cb29-11"></a><span class="co">#> 7 05 Arkansas income 23789 165</span></span> <span id="cb29-12"><a href="#cb29-12"></a><span class="co">#> 8 05 Arkansas rent 709 5</span></span> <span id="cb29-13"><a href="#cb29-13"></a><span class="co">#> 9 06 California income 29454 109</span></span> <span id="cb29-14"><a href="#cb29-14"></a><span class="co">#> 10 06 California rent 1358 3</span></span> <span id="cb29-15"><a href="#cb29-15"></a><span class="co">#> # … with 94 more rows</span></span></code></pre></div> <p>Here both <code>estimate</code> and <code>moe</code> are values columns, so we can supply them to <code>values_from</code>:</p> <div class="sourceCode" id="cb30"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb30-1"><a href="#cb30-1"></a>us_rent_income <span class="op">%>%</span><span class="st"> </span></span> <span id="cb30-2"><a href="#cb30-2"></a><span class="st"> </span><span class="kw">pivot_wider</span>(<span class="dt">names_from =</span> variable, <span class="dt">values_from =</span> <span class="kw">c</span>(estimate, moe))</span> <span id="cb30-3"><a href="#cb30-3"></a><span class="co">#> # A tibble: 52 x 6</span></span> <span id="cb30-4"><a href="#cb30-4"></a><span class="co">#> GEOID NAME estimate_income estimate_rent moe_income moe_rent</span></span> <span id="cb30-5"><a href="#cb30-5"></a><span class="co">#> <chr> <chr> <dbl> <dbl> <dbl> <dbl></span></span> <span id="cb30-6"><a href="#cb30-6"></a><span class="co">#> 1 01 Alabama 24476 747 136 3</span></span> <span id="cb30-7"><a href="#cb30-7"></a><span class="co">#> 2 02 Alaska 32940 1200 508 13</span></span> <span id="cb30-8"><a href="#cb30-8"></a><span class="co">#> 3 04 Arizona 27517 972 148 4</span></span> <span id="cb30-9"><a href="#cb30-9"></a><span class="co">#> 4 05 Arkansas 23789 709 165 5</span></span> <span id="cb30-10"><a href="#cb30-10"></a><span class="co">#> 5 06 California 29454 1358 109 3</span></span> <span id="cb30-11"><a href="#cb30-11"></a><span class="co">#> 6 08 Colorado 32401 1125 109 5</span></span> <span id="cb30-12"><a href="#cb30-12"></a><span class="co">#> 7 09 Connecticut 35326 1123 195 5</span></span> <span id="cb30-13"><a href="#cb30-13"></a><span class="co">#> 8 10 Delaware 31560 1076 247 10</span></span> <span id="cb30-14"><a href="#cb30-14"></a><span class="co">#> 9 11 District of Columbia 43198 1424 681 17</span></span> <span id="cb30-15"><a href="#cb30-15"></a><span class="co">#> 10 12 Florida 25952 1077 70 3</span></span> <span id="cb30-16"><a href="#cb30-16"></a><span class="co">#> # … with 42 more rows</span></span></code></pre></div> <p>Note that the name of the variable is automatically appended to the output columns.</p> </div> <div id="contact-list" class="section level2"> <h2>Contact list</h2> <p>A final challenge is inspired by <a href="https://github.com/jienagu/tidyverse_examples/blob/master/example_long_wide.R">Jiena Gu</a>. Imagine you have a contact list that you’ve copied and pasted from a website:</p> <div class="sourceCode" id="cb31"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb31-1"><a href="#cb31-1"></a>contacts <-<span class="st"> </span><span class="kw">tribble</span>(</span> <span id="cb31-2"><a href="#cb31-2"></a> <span class="op">~</span>field, <span class="op">~</span>value,</span> <span id="cb31-3"><a href="#cb31-3"></a> <span class="st">"name"</span>, <span class="st">"Jiena McLellan"</span>,</span> <span id="cb31-4"><a href="#cb31-4"></a> <span class="st">"company"</span>, <span class="st">"Toyota"</span>, </span> <span id="cb31-5"><a href="#cb31-5"></a> <span class="st">"name"</span>, <span class="st">"John Smith"</span>, </span> <span id="cb31-6"><a href="#cb31-6"></a> <span class="st">"company"</span>, <span class="st">"google"</span>, </span> <span id="cb31-7"><a href="#cb31-7"></a> <span class="st">"email"</span>, <span class="st">"john@google.com"</span>,</span> <span id="cb31-8"><a href="#cb31-8"></a> <span class="st">"name"</span>, <span class="st">"Huxley Ratcliffe"</span></span> <span id="cb31-9"><a href="#cb31-9"></a>)</span></code></pre></div> <p>This is challenging because there’s no variable that identifies which observations belong together. We can fix this by noting that every contact starts with a name, so we can create a unique id by counting every time we see “name” as the <code>field</code>:</p> <div class="sourceCode" id="cb32"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb32-1"><a href="#cb32-1"></a>contacts <-<span class="st"> </span>contacts <span class="op">%>%</span><span class="st"> </span></span> <span id="cb32-2"><a href="#cb32-2"></a><span class="st"> </span><span class="kw">mutate</span>(</span> <span id="cb32-3"><a href="#cb32-3"></a> <span class="dt">person_id =</span> <span class="kw">cumsum</span>(field <span class="op">==</span><span class="st"> "name"</span>)</span> <span id="cb32-4"><a href="#cb32-4"></a> )</span> <span id="cb32-5"><a href="#cb32-5"></a>contacts</span> <span id="cb32-6"><a href="#cb32-6"></a><span class="co">#> # A tibble: 6 x 3</span></span> <span id="cb32-7"><a href="#cb32-7"></a><span class="co">#> field value person_id</span></span> <span id="cb32-8"><a href="#cb32-8"></a><span class="co">#> <chr> <chr> <int></span></span> <span id="cb32-9"><a href="#cb32-9"></a><span class="co">#> 1 name Jiena McLellan 1</span></span> <span id="cb32-10"><a href="#cb32-10"></a><span class="co">#> 2 company Toyota 1</span></span> <span id="cb32-11"><a href="#cb32-11"></a><span class="co">#> 3 name John Smith 2</span></span> <span id="cb32-12"><a href="#cb32-12"></a><span class="co">#> 4 company google 2</span></span> <span id="cb32-13"><a href="#cb32-13"></a><span class="co">#> 5 email john@google.com 2</span></span> <span id="cb32-14"><a href="#cb32-14"></a><span class="co">#> 6 name Huxley Ratcliffe 3</span></span></code></pre></div> <p>Now that we have a unique identifier for each person, we can pivot <code>field</code> and <code>value</code> into the columns:</p> <div class="sourceCode" id="cb33"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb33-1"><a href="#cb33-1"></a>contacts <span class="op">%>%</span><span class="st"> </span></span> <span id="cb33-2"><a href="#cb33-2"></a><span class="st"> </span><span class="kw">pivot_wider</span>(<span class="dt">names_from =</span> field, <span class="dt">values_from =</span> value)</span> <span id="cb33-3"><a href="#cb33-3"></a><span class="co">#> # A tibble: 3 x 4</span></span> <span id="cb33-4"><a href="#cb33-4"></a><span class="co">#> person_id name company email </span></span> <span id="cb33-5"><a href="#cb33-5"></a><span class="co">#> <int> <chr> <chr> <chr> </span></span> <span id="cb33-6"><a href="#cb33-6"></a><span class="co">#> 1 1 Jiena McLellan Toyota <NA> </span></span> <span id="cb33-7"><a href="#cb33-7"></a><span class="co">#> 2 2 John Smith google john@google.com</span></span> <span id="cb33-8"><a href="#cb33-8"></a><span class="co">#> 3 3 Huxley Ratcliffe <NA> <NA></span></span></code></pre></div> </div> </div> <div id="longer-then-wider" class="section level1"> <h1>Longer, then wider</h1> <p>Some problems can’t be solved by pivotting in a single direction. The examples in this section show how you might combine <code>pivot_longer()</code> and <code>pivot_wider()</code> to solve more complex problems.</p> <div id="world-bank" class="section level2"> <h2>World bank</h2> <p><code>world_bank_pop</code> contains data from the World Bank about population per country from 2000 to 2018.</p> <div class="sourceCode" id="cb34"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb34-1"><a href="#cb34-1"></a>world_bank_pop</span> <span id="cb34-2"><a href="#cb34-2"></a><span class="co">#> # A tibble: 1,056 x 20</span></span> <span id="cb34-3"><a href="#cb34-3"></a><span class="co">#> country indicator `2000` `2001` `2002` `2003` `2004` `2005` `2006`</span></span> <span id="cb34-4"><a href="#cb34-4"></a><span class="co">#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl></span></span> <span id="cb34-5"><a href="#cb34-5"></a><span class="co">#> 1 ABW SP.URB.T… 4.24e4 4.30e4 4.37e4 4.42e4 4.47e+4 4.49e+4 4.49e+4</span></span> <span id="cb34-6"><a href="#cb34-6"></a><span class="co">#> 2 ABW SP.URB.G… 1.18e0 1.41e0 1.43e0 1.31e0 9.51e-1 4.91e-1 -1.78e-2</span></span> <span id="cb34-7"><a href="#cb34-7"></a><span class="co">#> 3 ABW SP.POP.T… 9.09e4 9.29e4 9.50e4 9.70e4 9.87e+4 1.00e+5 1.01e+5</span></span> <span id="cb34-8"><a href="#cb34-8"></a><span class="co">#> 4 ABW SP.POP.G… 2.06e0 2.23e0 2.23e0 2.11e0 1.76e+0 1.30e+0 7.98e-1</span></span> <span id="cb34-9"><a href="#cb34-9"></a><span class="co">#> 5 AFG SP.URB.T… 4.44e6 4.65e6 4.89e6 5.16e6 5.43e+6 5.69e+6 5.93e+6</span></span> <span id="cb34-10"><a href="#cb34-10"></a><span class="co">#> 6 AFG SP.URB.G… 3.91e0 4.66e0 5.13e0 5.23e0 5.12e+0 4.77e+0 4.12e+0</span></span> <span id="cb34-11"><a href="#cb34-11"></a><span class="co">#> 7 AFG SP.POP.T… 2.01e7 2.10e7 2.20e7 2.31e7 2.41e+7 2.51e+7 2.59e+7</span></span> <span id="cb34-12"><a href="#cb34-12"></a><span class="co">#> 8 AFG SP.POP.G… 3.49e0 4.25e0 4.72e0 4.82e0 4.47e+0 3.87e+0 3.23e+0</span></span> <span id="cb34-13"><a href="#cb34-13"></a><span class="co">#> 9 AGO SP.URB.T… 8.23e6 8.71e6 9.22e6 9.77e6 1.03e+7 1.09e+7 1.15e+7</span></span> <span id="cb34-14"><a href="#cb34-14"></a><span class="co">#> 10 AGO SP.URB.G… 5.44e0 5.59e0 5.70e0 5.76e0 5.75e+0 5.69e+0 4.92e+0</span></span> <span id="cb34-15"><a href="#cb34-15"></a><span class="co">#> # … with 1,046 more rows, and 11 more variables: `2007` <dbl>, `2008` <dbl>,</span></span> <span id="cb34-16"><a href="#cb34-16"></a><span class="co">#> # `2009` <dbl>, `2010` <dbl>, `2011` <dbl>, `2012` <dbl>, `2013` <dbl>,</span></span> <span id="cb34-17"><a href="#cb34-17"></a><span class="co">#> # `2014` <dbl>, `2015` <dbl>, `2016` <dbl>, `2017` <dbl></span></span></code></pre></div> <p>My goal is to produce a tidy dataset where each variable is in a column. It’s not obvious exactly what steps are needed yet, but I’ll start with the most obvious problem: year is spread across multiple columns.</p> <div class="sourceCode" id="cb35"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb35-1"><a href="#cb35-1"></a>pop2 <-<span class="st"> </span>world_bank_pop <span class="op">%>%</span><span class="st"> </span></span> <span id="cb35-2"><a href="#cb35-2"></a><span class="st"> </span><span class="kw">pivot_longer</span>(<span class="st">`</span><span class="dt">2000</span><span class="st">`</span><span class="op">:</span><span class="st">`</span><span class="dt">2017</span><span class="st">`</span>, <span class="dt">names_to =</span> <span class="st">"year"</span>, <span class="dt">values_to =</span> <span class="st">"value"</span>)</span> <span id="cb35-3"><a href="#cb35-3"></a>pop2</span> <span id="cb35-4"><a href="#cb35-4"></a><span class="co">#> # A tibble: 19,008 x 4</span></span> <span id="cb35-5"><a href="#cb35-5"></a><span class="co">#> country indicator year value</span></span> <span id="cb35-6"><a href="#cb35-6"></a><span class="co">#> <chr> <chr> <chr> <dbl></span></span> <span id="cb35-7"><a href="#cb35-7"></a><span class="co">#> 1 ABW SP.URB.TOTL 2000 42444</span></span> <span id="cb35-8"><a href="#cb35-8"></a><span class="co">#> 2 ABW SP.URB.TOTL 2001 43048</span></span> <span id="cb35-9"><a href="#cb35-9"></a><span class="co">#> 3 ABW SP.URB.TOTL 2002 43670</span></span> <span id="cb35-10"><a href="#cb35-10"></a><span class="co">#> 4 ABW SP.URB.TOTL 2003 44246</span></span> <span id="cb35-11"><a href="#cb35-11"></a><span class="co">#> 5 ABW SP.URB.TOTL 2004 44669</span></span> <span id="cb35-12"><a href="#cb35-12"></a><span class="co">#> 6 ABW SP.URB.TOTL 2005 44889</span></span> <span id="cb35-13"><a href="#cb35-13"></a><span class="co">#> 7 ABW SP.URB.TOTL 2006 44881</span></span> <span id="cb35-14"><a href="#cb35-14"></a><span class="co">#> 8 ABW SP.URB.TOTL 2007 44686</span></span> <span id="cb35-15"><a href="#cb35-15"></a><span class="co">#> 9 ABW SP.URB.TOTL 2008 44375</span></span> <span id="cb35-16"><a href="#cb35-16"></a><span class="co">#> 10 ABW SP.URB.TOTL 2009 44052</span></span> <span id="cb35-17"><a href="#cb35-17"></a><span class="co">#> # … with 18,998 more rows</span></span></code></pre></div> <p>Next we need to consider the <code>indicator</code> variable:</p> <div class="sourceCode" id="cb36"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb36-1"><a href="#cb36-1"></a>pop2 <span class="op">%>%</span><span class="st"> </span><span class="kw">count</span>(indicator)</span> <span id="cb36-2"><a href="#cb36-2"></a><span class="co">#> # A tibble: 4 x 2</span></span> <span id="cb36-3"><a href="#cb36-3"></a><span class="co">#> indicator n</span></span> <span id="cb36-4"><a href="#cb36-4"></a><span class="co">#> <chr> <int></span></span> <span id="cb36-5"><a href="#cb36-5"></a><span class="co">#> 1 SP.POP.GROW 4752</span></span> <span id="cb36-6"><a href="#cb36-6"></a><span class="co">#> 2 SP.POP.TOTL 4752</span></span> <span id="cb36-7"><a href="#cb36-7"></a><span class="co">#> 3 SP.URB.GROW 4752</span></span> <span id="cb36-8"><a href="#cb36-8"></a><span class="co">#> 4 SP.URB.TOTL 4752</span></span></code></pre></div> <p>Here <code>SP.POP.GROW</code> is population growth, <code>SP.POP.TOTL</code> is total population, and <code>SP.URB.*</code> are the same but only for urban areas. Let’s split this up into two variables: <code>area</code> (total or urban) and the actual variable (population or growth):</p> <div class="sourceCode" id="cb37"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb37-1"><a href="#cb37-1"></a>pop3 <-<span class="st"> </span>pop2 <span class="op">%>%</span><span class="st"> </span></span> <span id="cb37-2"><a href="#cb37-2"></a><span class="st"> </span><span class="kw">separate</span>(indicator, <span class="kw">c</span>(<span class="ot">NA</span>, <span class="st">"area"</span>, <span class="st">"variable"</span>))</span> <span id="cb37-3"><a href="#cb37-3"></a>pop3</span> <span id="cb37-4"><a href="#cb37-4"></a><span class="co">#> # A tibble: 19,008 x 5</span></span> <span id="cb37-5"><a href="#cb37-5"></a><span class="co">#> country area variable year value</span></span> <span id="cb37-6"><a href="#cb37-6"></a><span class="co">#> <chr> <chr> <chr> <chr> <dbl></span></span> <span id="cb37-7"><a href="#cb37-7"></a><span class="co">#> 1 ABW URB TOTL 2000 42444</span></span> <span id="cb37-8"><a href="#cb37-8"></a><span class="co">#> 2 ABW URB TOTL 2001 43048</span></span> <span id="cb37-9"><a href="#cb37-9"></a><span class="co">#> 3 ABW URB TOTL 2002 43670</span></span> <span id="cb37-10"><a href="#cb37-10"></a><span class="co">#> 4 ABW URB TOTL 2003 44246</span></span> <span id="cb37-11"><a href="#cb37-11"></a><span class="co">#> 5 ABW URB TOTL 2004 44669</span></span> <span id="cb37-12"><a href="#cb37-12"></a><span class="co">#> 6 ABW URB TOTL 2005 44889</span></span> <span id="cb37-13"><a href="#cb37-13"></a><span class="co">#> 7 ABW URB TOTL 2006 44881</span></span> <span id="cb37-14"><a href="#cb37-14"></a><span class="co">#> 8 ABW URB TOTL 2007 44686</span></span> <span id="cb37-15"><a href="#cb37-15"></a><span class="co">#> 9 ABW URB TOTL 2008 44375</span></span> <span id="cb37-16"><a href="#cb37-16"></a><span class="co">#> 10 ABW URB TOTL 2009 44052</span></span> <span id="cb37-17"><a href="#cb37-17"></a><span class="co">#> # … with 18,998 more rows</span></span></code></pre></div> <p>Now we can complete the tidying by pivoting <code>variable</code> and <code>value</code> to make <code>TOTL</code> and <code>GROW</code> columns:</p> <div class="sourceCode" id="cb38"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb38-1"><a href="#cb38-1"></a>pop3 <span class="op">%>%</span><span class="st"> </span></span> <span id="cb38-2"><a href="#cb38-2"></a><span class="st"> </span><span class="kw">pivot_wider</span>(<span class="dt">names_from =</span> variable, <span class="dt">values_from =</span> value)</span> <span id="cb38-3"><a href="#cb38-3"></a><span class="co">#> # A tibble: 9,504 x 5</span></span> <span id="cb38-4"><a href="#cb38-4"></a><span class="co">#> country area year TOTL GROW</span></span> <span id="cb38-5"><a href="#cb38-5"></a><span class="co">#> <chr> <chr> <chr> <dbl> <dbl></span></span> <span id="cb38-6"><a href="#cb38-6"></a><span class="co">#> 1 ABW URB 2000 42444 1.18 </span></span> <span id="cb38-7"><a href="#cb38-7"></a><span class="co">#> 2 ABW URB 2001 43048 1.41 </span></span> <span id="cb38-8"><a href="#cb38-8"></a><span class="co">#> 3 ABW URB 2002 43670 1.43 </span></span> <span id="cb38-9"><a href="#cb38-9"></a><span class="co">#> 4 ABW URB 2003 44246 1.31 </span></span> <span id="cb38-10"><a href="#cb38-10"></a><span class="co">#> 5 ABW URB 2004 44669 0.951 </span></span> <span id="cb38-11"><a href="#cb38-11"></a><span class="co">#> 6 ABW URB 2005 44889 0.491 </span></span> <span id="cb38-12"><a href="#cb38-12"></a><span class="co">#> 7 ABW URB 2006 44881 -0.0178</span></span> <span id="cb38-13"><a href="#cb38-13"></a><span class="co">#> 8 ABW URB 2007 44686 -0.435 </span></span> <span id="cb38-14"><a href="#cb38-14"></a><span class="co">#> 9 ABW URB 2008 44375 -0.698 </span></span> <span id="cb38-15"><a href="#cb38-15"></a><span class="co">#> 10 ABW URB 2009 44052 -0.731 </span></span> <span id="cb38-16"><a href="#cb38-16"></a><span class="co">#> # … with 9,494 more rows</span></span></code></pre></div> </div> <div id="multi-choice" class="section level2"> <h2>Multi-choice</h2> <p>Based on a suggestion by <a href="https://github.com/MaximeWack">Maxime Wack</a>, <a href="https://github.com/tidyverse/tidyr/issues/384" class="uri">https://github.com/tidyverse/tidyr/issues/384</a>), the final example shows how to deal with a common way of recording multiple choice data. Often you will get such data as follows:</p> <div class="sourceCode" id="cb39"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb39-1"><a href="#cb39-1"></a>multi <-<span class="st"> </span><span class="kw">tribble</span>(</span> <span id="cb39-2"><a href="#cb39-2"></a> <span class="op">~</span>id, <span class="op">~</span>choice1, <span class="op">~</span>choice2, <span class="op">~</span>choice3,</span> <span id="cb39-3"><a href="#cb39-3"></a> <span class="dv">1</span>, <span class="st">"A"</span>, <span class="st">"B"</span>, <span class="st">"C"</span>,</span> <span id="cb39-4"><a href="#cb39-4"></a> <span class="dv">2</span>, <span class="st">"C"</span>, <span class="st">"B"</span>, <span class="ot">NA</span>,</span> <span id="cb39-5"><a href="#cb39-5"></a> <span class="dv">3</span>, <span class="st">"D"</span>, <span class="ot">NA</span>, <span class="ot">NA</span>,</span> <span id="cb39-6"><a href="#cb39-6"></a> <span class="dv">4</span>, <span class="st">"B"</span>, <span class="st">"D"</span>, <span class="ot">NA</span></span> <span id="cb39-7"><a href="#cb39-7"></a>)</span></code></pre></div> <p>But the actual order isn’t important, and you’d prefer to have the individual questions in the columns. You can achieve the desired transformation in two steps. First, you make the data longer, eliminating the explcit <code>NA</code>s, and adding a column to indicate that this choice was chosen:</p> <div class="sourceCode" id="cb40"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb40-1"><a href="#cb40-1"></a>multi2 <-<span class="st"> </span>multi <span class="op">%>%</span><span class="st"> </span></span> <span id="cb40-2"><a href="#cb40-2"></a><span class="st"> </span><span class="kw">pivot_longer</span>(<span class="op">!</span>id, <span class="dt">values_drop_na =</span> <span class="ot">TRUE</span>) <span class="op">%>%</span><span class="st"> </span></span> <span id="cb40-3"><a href="#cb40-3"></a><span class="st"> </span><span class="kw">mutate</span>(<span class="dt">checked =</span> <span class="ot">TRUE</span>)</span> <span id="cb40-4"><a href="#cb40-4"></a>multi2</span> <span id="cb40-5"><a href="#cb40-5"></a><span class="co">#> # A tibble: 8 x 4</span></span> <span id="cb40-6"><a href="#cb40-6"></a><span class="co">#> id name value checked</span></span> <span id="cb40-7"><a href="#cb40-7"></a><span class="co">#> <dbl> <chr> <chr> <lgl> </span></span> <span id="cb40-8"><a href="#cb40-8"></a><span class="co">#> 1 1 choice1 A TRUE </span></span> <span id="cb40-9"><a href="#cb40-9"></a><span class="co">#> 2 1 choice2 B TRUE </span></span> <span id="cb40-10"><a href="#cb40-10"></a><span class="co">#> 3 1 choice3 C TRUE </span></span> <span id="cb40-11"><a href="#cb40-11"></a><span class="co">#> 4 2 choice1 C TRUE </span></span> <span id="cb40-12"><a href="#cb40-12"></a><span class="co">#> 5 2 choice2 B TRUE </span></span> <span id="cb40-13"><a href="#cb40-13"></a><span class="co">#> 6 3 choice1 D TRUE </span></span> <span id="cb40-14"><a href="#cb40-14"></a><span class="co">#> 7 4 choice1 B TRUE </span></span> <span id="cb40-15"><a href="#cb40-15"></a><span class="co">#> 8 4 choice2 D TRUE</span></span></code></pre></div> <p>Then you make the data wider, filling in the missing observations with <code>FALSE</code>:</p> <div class="sourceCode" id="cb41"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb41-1"><a href="#cb41-1"></a>multi2 <span class="op">%>%</span><span class="st"> </span></span> <span id="cb41-2"><a href="#cb41-2"></a><span class="st"> </span><span class="kw">pivot_wider</span>(</span> <span id="cb41-3"><a href="#cb41-3"></a> <span class="dt">id_cols =</span> id,</span> <span id="cb41-4"><a href="#cb41-4"></a> <span class="dt">names_from =</span> value, </span> <span id="cb41-5"><a href="#cb41-5"></a> <span class="dt">values_from =</span> checked, </span> <span id="cb41-6"><a href="#cb41-6"></a> <span class="dt">values_fill =</span> <span class="ot">FALSE</span></span> <span id="cb41-7"><a href="#cb41-7"></a> )</span> <span id="cb41-8"><a href="#cb41-8"></a><span class="co">#> # A tibble: 4 x 5</span></span> <span id="cb41-9"><a href="#cb41-9"></a><span class="co">#> id A B C D </span></span> <span id="cb41-10"><a href="#cb41-10"></a><span class="co">#> <dbl> <lgl> <lgl> <lgl> <lgl></span></span> <span id="cb41-11"><a href="#cb41-11"></a><span class="co">#> 1 1 TRUE TRUE TRUE FALSE</span></span> <span id="cb41-12"><a href="#cb41-12"></a><span class="co">#> 2 2 FALSE TRUE TRUE FALSE</span></span> <span id="cb41-13"><a href="#cb41-13"></a><span class="co">#> 3 3 FALSE FALSE FALSE TRUE </span></span> <span id="cb41-14"><a href="#cb41-14"></a><span class="co">#> 4 4 FALSE TRUE FALSE TRUE</span></span></code></pre></div> </div> </div> <div id="manual-specs" class="section level1"> <h1>Manual specs</h1> <p>The arguments to <code>pivot_longer()</code> and <code>pivot_wider()</code> allow you to pivot a wide range of datasets. But the creativity that people apply to their data structures is seemingly endless, so it’s quite possible that you will encounter a dataset that you can’t immediately see how to reshape with <code>pivot_longer()</code> and <code>pivot_wider()</code>. To gain more control over pivotting, you can instead create a “spec” data frame that describes exactly how data stored in the column names becomes variables (and vice versa). This section introduces you to the spec data structure, and show you how to use it when <code>pivot_longer()</code> and <code>pivot_wider()</code> are insufficient.</p> <div id="longer-1" class="section level2"> <h2>Longer</h2> <p>To see how this works, lets return to the simplest case of pivotting applied to the <code>relig_income</code> dataset. Now pivotting happens in two steps: we first create a spec object (using <code>build_longer_spec()</code>) then use that to describe the pivotting operation:</p> <div class="sourceCode" id="cb42"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb42-1"><a href="#cb42-1"></a>spec <-<span class="st"> </span>relig_income <span class="op">%>%</span><span class="st"> </span><span class="kw">build_longer_spec</span>(</span> <span id="cb42-2"><a href="#cb42-2"></a> <span class="dt">cols =</span> <span class="op">!</span>religion, </span> <span id="cb42-3"><a href="#cb42-3"></a> <span class="dt">names_to =</span> <span class="st">"income"</span>,</span> <span id="cb42-4"><a href="#cb42-4"></a> <span class="dt">values_to =</span> <span class="st">"count"</span></span> <span id="cb42-5"><a href="#cb42-5"></a>)</span> <span id="cb42-6"><a href="#cb42-6"></a><span class="kw">pivot_longer_spec</span>(relig_income, spec)</span> <span id="cb42-7"><a href="#cb42-7"></a><span class="co">#> # A tibble: 180 x 3</span></span> <span id="cb42-8"><a href="#cb42-8"></a><span class="co">#> religion income count</span></span> <span id="cb42-9"><a href="#cb42-9"></a><span class="co">#> <chr> <chr> <dbl></span></span> <span id="cb42-10"><a href="#cb42-10"></a><span class="co">#> 1 Agnostic <$10k 27</span></span> <span id="cb42-11"><a href="#cb42-11"></a><span class="co">#> 2 Agnostic $10-20k 34</span></span> <span id="cb42-12"><a href="#cb42-12"></a><span class="co">#> 3 Agnostic $20-30k 60</span></span> <span id="cb42-13"><a href="#cb42-13"></a><span class="co">#> 4 Agnostic $30-40k 81</span></span> <span id="cb42-14"><a href="#cb42-14"></a><span class="co">#> 5 Agnostic $40-50k 76</span></span> <span id="cb42-15"><a href="#cb42-15"></a><span class="co">#> 6 Agnostic $50-75k 137</span></span> <span id="cb42-16"><a href="#cb42-16"></a><span class="co">#> 7 Agnostic $75-100k 122</span></span> <span id="cb42-17"><a href="#cb42-17"></a><span class="co">#> 8 Agnostic $100-150k 109</span></span> <span id="cb42-18"><a href="#cb42-18"></a><span class="co">#> 9 Agnostic >150k 84</span></span> <span id="cb42-19"><a href="#cb42-19"></a><span class="co">#> 10 Agnostic Don't know/refused 96</span></span> <span id="cb42-20"><a href="#cb42-20"></a><span class="co">#> # … with 170 more rows</span></span></code></pre></div> <p>(This gives the same result as before, just with more code. There’s no need to use it here, it is presented as a simple example for using <code>spec</code>.)</p> <p>What does <code>spec</code> look like? It’s a data frame with one row for each column in the wide format version of the data that is not present in the long format, and two special columns that start with <code>.</code>:</p> <ul> <li><code>.name</code> gives the name of the column.</li> <li><code>.value</code> gives the name of the column that the values in the cells will go into.</li> </ul> <p>There is also one column in <code>spec</code> for each column present in the long format of the data that is not present in the wide format of the data. This corresponds to the <code>names_to</code> argument in <code>pivot_longer()</code> and <code>build_longer_spec()</code> and the <code>names_from</code> argument in <code>pivot_wider()</code> and <code>build_wider_spec()</code>. In this example, the income column is a character vector of the names of columns being pivoted.</p> <div class="sourceCode" id="cb43"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb43-1"><a href="#cb43-1"></a>spec</span> <span id="cb43-2"><a href="#cb43-2"></a><span class="co">#> # A tibble: 10 x 3</span></span> <span id="cb43-3"><a href="#cb43-3"></a><span class="co">#> .name .value income </span></span> <span id="cb43-4"><a href="#cb43-4"></a><span class="co">#> <chr> <chr> <chr> </span></span> <span id="cb43-5"><a href="#cb43-5"></a><span class="co">#> 1 <$10k count <$10k </span></span> <span id="cb43-6"><a href="#cb43-6"></a><span class="co">#> 2 $10-20k count $10-20k </span></span> <span id="cb43-7"><a href="#cb43-7"></a><span class="co">#> 3 $20-30k count $20-30k </span></span> <span id="cb43-8"><a href="#cb43-8"></a><span class="co">#> 4 $30-40k count $30-40k </span></span> <span id="cb43-9"><a href="#cb43-9"></a><span class="co">#> 5 $40-50k count $40-50k </span></span> <span id="cb43-10"><a href="#cb43-10"></a><span class="co">#> 6 $50-75k count $50-75k </span></span> <span id="cb43-11"><a href="#cb43-11"></a><span class="co">#> 7 $75-100k count $75-100k </span></span> <span id="cb43-12"><a href="#cb43-12"></a><span class="co">#> 8 $100-150k count $100-150k </span></span> <span id="cb43-13"><a href="#cb43-13"></a><span class="co">#> 9 >150k count >150k </span></span> <span id="cb43-14"><a href="#cb43-14"></a><span class="co">#> 10 Don't know/refused count Don't know/refused</span></span></code></pre></div> </div> <div id="wider-1" class="section level2"> <h2>Wider</h2> <p>Below we widen <code>us_rent_income</code> with <code>pivot_wider()</code>. The result is ok, but I think it could be improved:</p> <div class="sourceCode" id="cb44"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb44-1"><a href="#cb44-1"></a>us_rent_income <span class="op">%>%</span><span class="st"> </span></span> <span id="cb44-2"><a href="#cb44-2"></a><span class="st"> </span><span class="kw">pivot_wider</span>(<span class="dt">names_from =</span> variable, <span class="dt">values_from =</span> <span class="kw">c</span>(estimate, moe))</span> <span id="cb44-3"><a href="#cb44-3"></a><span class="co">#> # A tibble: 52 x 6</span></span> <span id="cb44-4"><a href="#cb44-4"></a><span class="co">#> GEOID NAME estimate_income estimate_rent moe_income moe_rent</span></span> <span id="cb44-5"><a href="#cb44-5"></a><span class="co">#> <chr> <chr> <dbl> <dbl> <dbl> <dbl></span></span> <span id="cb44-6"><a href="#cb44-6"></a><span class="co">#> 1 01 Alabama 24476 747 136 3</span></span> <span id="cb44-7"><a href="#cb44-7"></a><span class="co">#> 2 02 Alaska 32940 1200 508 13</span></span> <span id="cb44-8"><a href="#cb44-8"></a><span class="co">#> 3 04 Arizona 27517 972 148 4</span></span> <span id="cb44-9"><a href="#cb44-9"></a><span class="co">#> 4 05 Arkansas 23789 709 165 5</span></span> <span id="cb44-10"><a href="#cb44-10"></a><span class="co">#> 5 06 California 29454 1358 109 3</span></span> <span id="cb44-11"><a href="#cb44-11"></a><span class="co">#> 6 08 Colorado 32401 1125 109 5</span></span> <span id="cb44-12"><a href="#cb44-12"></a><span class="co">#> 7 09 Connecticut 35326 1123 195 5</span></span> <span id="cb44-13"><a href="#cb44-13"></a><span class="co">#> 8 10 Delaware 31560 1076 247 10</span></span> <span id="cb44-14"><a href="#cb44-14"></a><span class="co">#> 9 11 District of Columbia 43198 1424 681 17</span></span> <span id="cb44-15"><a href="#cb44-15"></a><span class="co">#> 10 12 Florida 25952 1077 70 3</span></span> <span id="cb44-16"><a href="#cb44-16"></a><span class="co">#> # … with 42 more rows</span></span></code></pre></div> <p>I think it would be better to have columns <code>income</code>, <code>rent</code>, <code>income_moe</code>, and <code>rent_moe</code>, which we can achieve with a manual spec. The current spec looks like this:</p> <div class="sourceCode" id="cb45"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb45-1"><a href="#cb45-1"></a>spec1 <-<span class="st"> </span>us_rent_income <span class="op">%>%</span><span class="st"> </span></span> <span id="cb45-2"><a href="#cb45-2"></a><span class="st"> </span><span class="kw">build_wider_spec</span>(<span class="dt">names_from =</span> variable, <span class="dt">values_from =</span> <span class="kw">c</span>(estimate, moe))</span> <span id="cb45-3"><a href="#cb45-3"></a>spec1</span> <span id="cb45-4"><a href="#cb45-4"></a><span class="co">#> # A tibble: 4 x 3</span></span> <span id="cb45-5"><a href="#cb45-5"></a><span class="co">#> .name .value variable</span></span> <span id="cb45-6"><a href="#cb45-6"></a><span class="co">#> <chr> <chr> <chr> </span></span> <span id="cb45-7"><a href="#cb45-7"></a><span class="co">#> 1 estimate_income estimate income </span></span> <span id="cb45-8"><a href="#cb45-8"></a><span class="co">#> 2 estimate_rent estimate rent </span></span> <span id="cb45-9"><a href="#cb45-9"></a><span class="co">#> 3 moe_income moe income </span></span> <span id="cb45-10"><a href="#cb45-10"></a><span class="co">#> 4 moe_rent moe rent</span></span></code></pre></div> <p>For this case, we mutate <code>spec</code> to carefully construct the column names:</p> <div class="sourceCode" id="cb46"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb46-1"><a href="#cb46-1"></a>spec2 <-<span class="st"> </span>spec1 <span class="op">%>%</span></span> <span id="cb46-2"><a href="#cb46-2"></a><span class="st"> </span><span class="kw">mutate</span>(<span class="dt">.name =</span> <span class="kw">paste0</span>(variable, <span class="kw">ifelse</span>(.value <span class="op">==</span><span class="st"> "moe"</span>, <span class="st">"_moe"</span>, <span class="st">""</span>)))</span> <span id="cb46-3"><a href="#cb46-3"></a>spec2</span> <span id="cb46-4"><a href="#cb46-4"></a><span class="co">#> # A tibble: 4 x 3</span></span> <span id="cb46-5"><a href="#cb46-5"></a><span class="co">#> .name .value variable</span></span> <span id="cb46-6"><a href="#cb46-6"></a><span class="co">#> <chr> <chr> <chr> </span></span> <span id="cb46-7"><a href="#cb46-7"></a><span class="co">#> 1 income estimate income </span></span> <span id="cb46-8"><a href="#cb46-8"></a><span class="co">#> 2 rent estimate rent </span></span> <span id="cb46-9"><a href="#cb46-9"></a><span class="co">#> 3 income_moe moe income </span></span> <span id="cb46-10"><a href="#cb46-10"></a><span class="co">#> 4 rent_moe moe rent</span></span></code></pre></div> <p>Supplying this spec to <code>pivot_wider()</code> gives us the result we’re looking for:</p> <div class="sourceCode" id="cb47"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb47-1"><a href="#cb47-1"></a><span class="kw">pivot_wider_spec</span>(us_rent_income, spec2)</span> <span id="cb47-2"><a href="#cb47-2"></a><span class="co">#> # A tibble: 52 x 6</span></span> <span id="cb47-3"><a href="#cb47-3"></a><span class="co">#> GEOID NAME income rent income_moe rent_moe</span></span> <span id="cb47-4"><a href="#cb47-4"></a><span class="co">#> <chr> <chr> <dbl> <dbl> <dbl> <dbl></span></span> <span id="cb47-5"><a href="#cb47-5"></a><span class="co">#> 1 01 Alabama 24476 747 136 3</span></span> <span id="cb47-6"><a href="#cb47-6"></a><span class="co">#> 2 02 Alaska 32940 1200 508 13</span></span> <span id="cb47-7"><a href="#cb47-7"></a><span class="co">#> 3 04 Arizona 27517 972 148 4</span></span> <span id="cb47-8"><a href="#cb47-8"></a><span class="co">#> 4 05 Arkansas 23789 709 165 5</span></span> <span id="cb47-9"><a href="#cb47-9"></a><span class="co">#> 5 06 California 29454 1358 109 3</span></span> <span id="cb47-10"><a href="#cb47-10"></a><span class="co">#> 6 08 Colorado 32401 1125 109 5</span></span> <span id="cb47-11"><a href="#cb47-11"></a><span class="co">#> 7 09 Connecticut 35326 1123 195 5</span></span> <span id="cb47-12"><a href="#cb47-12"></a><span class="co">#> 8 10 Delaware 31560 1076 247 10</span></span> <span id="cb47-13"><a href="#cb47-13"></a><span class="co">#> 9 11 District of Columbia 43198 1424 681 17</span></span> <span id="cb47-14"><a href="#cb47-14"></a><span class="co">#> 10 12 Florida 25952 1077 70 3</span></span> <span id="cb47-15"><a href="#cb47-15"></a><span class="co">#> # … with 42 more rows</span></span></code></pre></div> </div> <div id="by-hand" class="section level2"> <h2>By hand</h2> <p>Sometimes it’s not possible (or not convenient) to compute the spec, and instead it’s more convenient to construct the spec “by hand”. For example, take this <code>construction</code> data, which is lightly modified from Table 5 “completions” found at <a href="https://www.census.gov/construction/nrc/index.html" class="uri">https://www.census.gov/construction/nrc/index.html</a>:</p> <div class="sourceCode" id="cb48"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb48-1"><a href="#cb48-1"></a>construction</span> <span id="cb48-2"><a href="#cb48-2"></a><span class="co">#> # A tibble: 9 x 9</span></span> <span id="cb48-3"><a href="#cb48-3"></a><span class="co">#> Year Month `1 unit` `2 to 4 units` `5 units or mor… Northeast Midwest South</span></span> <span id="cb48-4"><a href="#cb48-4"></a><span class="co">#> <dbl> <chr> <dbl> <lgl> <dbl> <dbl> <dbl> <dbl></span></span> <span id="cb48-5"><a href="#cb48-5"></a><span class="co">#> 1 2018 Janu… 859 NA 348 114 169 596</span></span> <span id="cb48-6"><a href="#cb48-6"></a><span class="co">#> 2 2018 Febr… 882 NA 400 138 160 655</span></span> <span id="cb48-7"><a href="#cb48-7"></a><span class="co">#> 3 2018 March 862 NA 356 150 154 595</span></span> <span id="cb48-8"><a href="#cb48-8"></a><span class="co">#> 4 2018 April 797 NA 447 144 196 613</span></span> <span id="cb48-9"><a href="#cb48-9"></a><span class="co">#> 5 2018 May 875 NA 364 90 169 673</span></span> <span id="cb48-10"><a href="#cb48-10"></a><span class="co">#> 6 2018 June 867 NA 342 76 170 610</span></span> <span id="cb48-11"><a href="#cb48-11"></a><span class="co">#> 7 2018 July 829 NA 360 108 183 594</span></span> <span id="cb48-12"><a href="#cb48-12"></a><span class="co">#> 8 2018 Augu… 939 NA 286 90 205 649</span></span> <span id="cb48-13"><a href="#cb48-13"></a><span class="co">#> 9 2018 Sept… 835 NA 304 117 175 560</span></span> <span id="cb48-14"><a href="#cb48-14"></a><span class="co">#> # … with 1 more variable: West <dbl></span></span></code></pre></div> <p>This sort of data is not uncommon from government agencies: the column names actually belong to different variables, and here we have summaries for number of units (1, 2-4, 5+) and regions of the country (NE, NW, midwest, S, W). We can most easily describe that with a tibble:</p> <div class="sourceCode" id="cb49"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb49-1"><a href="#cb49-1"></a>spec <-<span class="st"> </span><span class="kw">tribble</span>(</span> <span id="cb49-2"><a href="#cb49-2"></a> <span class="op">~</span>.name, <span class="op">~</span>.value, <span class="op">~</span>units, <span class="op">~</span>region, </span> <span id="cb49-3"><a href="#cb49-3"></a> <span class="st">"1 unit"</span>, <span class="st">"n"</span>, <span class="st">"1"</span>, <span class="ot">NA</span>, </span> <span id="cb49-4"><a href="#cb49-4"></a> <span class="st">"2 to 4 units"</span>, <span class="st">"n"</span>, <span class="st">"2-4"</span>, <span class="ot">NA</span>, </span> <span id="cb49-5"><a href="#cb49-5"></a> <span class="st">"5 units or more"</span>, <span class="st">"n"</span>, <span class="st">"5+"</span>, <span class="ot">NA</span>, </span> <span id="cb49-6"><a href="#cb49-6"></a> <span class="st">"Northeast"</span>, <span class="st">"n"</span>, <span class="ot">NA</span>, <span class="st">"Northeast"</span>, </span> <span id="cb49-7"><a href="#cb49-7"></a> <span class="st">"Midwest"</span>, <span class="st">"n"</span>, <span class="ot">NA</span>, <span class="st">"Midwest"</span>, </span> <span id="cb49-8"><a href="#cb49-8"></a> <span class="st">"South"</span>, <span class="st">"n"</span>, <span class="ot">NA</span>, <span class="st">"South"</span>, </span> <span id="cb49-9"><a href="#cb49-9"></a> <span class="st">"West"</span>, <span class="st">"n"</span>, <span class="ot">NA</span>, <span class="st">"West"</span>, </span> <span id="cb49-10"><a href="#cb49-10"></a>)</span></code></pre></div> <p>Which yields the following longer form:</p> <div class="sourceCode" id="cb50"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb50-1"><a href="#cb50-1"></a><span class="kw">pivot_longer_spec</span>(construction, spec)</span> <span id="cb50-2"><a href="#cb50-2"></a><span class="co">#> # A tibble: 63 x 5</span></span> <span id="cb50-3"><a href="#cb50-3"></a><span class="co">#> Year Month units region n</span></span> <span id="cb50-4"><a href="#cb50-4"></a><span class="co">#> <dbl> <chr> <chr> <chr> <dbl></span></span> <span id="cb50-5"><a href="#cb50-5"></a><span class="co">#> 1 2018 January 1 <NA> 859</span></span> <span id="cb50-6"><a href="#cb50-6"></a><span class="co">#> 2 2018 January 2-4 <NA> NA</span></span> <span id="cb50-7"><a href="#cb50-7"></a><span class="co">#> 3 2018 January 5+ <NA> 348</span></span> <span id="cb50-8"><a href="#cb50-8"></a><span class="co">#> 4 2018 January <NA> Northeast 114</span></span> <span id="cb50-9"><a href="#cb50-9"></a><span class="co">#> 5 2018 January <NA> Midwest 169</span></span> <span id="cb50-10"><a href="#cb50-10"></a><span class="co">#> 6 2018 January <NA> South 596</span></span> <span id="cb50-11"><a href="#cb50-11"></a><span class="co">#> 7 2018 January <NA> West 339</span></span> <span id="cb50-12"><a href="#cb50-12"></a><span class="co">#> 8 2018 February 1 <NA> 882</span></span> <span id="cb50-13"><a href="#cb50-13"></a><span class="co">#> 9 2018 February 2-4 <NA> NA</span></span> <span id="cb50-14"><a href="#cb50-14"></a><span class="co">#> 10 2018 February 5+ <NA> 400</span></span> <span id="cb50-15"><a href="#cb50-15"></a><span class="co">#> # … with 53 more rows</span></span></code></pre></div> <p>Note that there is no overlap between the <code>units</code> and <code>region</code> variables; here the data would really be most naturally described in two independent tables.</p> </div> <div id="theory" class="section level2"> <h2>Theory</h2> <p>One neat property of the <code>spec</code> is that you need the same spec for <code>pivot_longer()</code> and <code>pivot_wider()</code>. This makes it very clear that the two operations are symmetric:</p> <div class="sourceCode" id="cb51"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb51-1"><a href="#cb51-1"></a>construction <span class="op">%>%</span><span class="st"> </span></span> <span id="cb51-2"><a href="#cb51-2"></a><span class="st"> </span><span class="kw">pivot_longer_spec</span>(spec) <span class="op">%>%</span><span class="st"> </span></span> <span id="cb51-3"><a href="#cb51-3"></a><span class="st"> </span><span class="kw">pivot_wider_spec</span>(spec)</span> <span id="cb51-4"><a href="#cb51-4"></a><span class="co">#> # A tibble: 9 x 9</span></span> <span id="cb51-5"><a href="#cb51-5"></a><span class="co">#> Year Month `1 unit` `2 to 4 units` `5 units or mor… Northeast Midwest South</span></span> <span id="cb51-6"><a href="#cb51-6"></a><span class="co">#> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl></span></span> <span id="cb51-7"><a href="#cb51-7"></a><span class="co">#> 1 2018 Janu… 859 NA 348 114 169 596</span></span> <span id="cb51-8"><a href="#cb51-8"></a><span class="co">#> 2 2018 Febr… 882 NA 400 138 160 655</span></span> <span id="cb51-9"><a href="#cb51-9"></a><span class="co">#> 3 2018 March 862 NA 356 150 154 595</span></span> <span id="cb51-10"><a href="#cb51-10"></a><span class="co">#> 4 2018 April 797 NA 447 144 196 613</span></span> <span id="cb51-11"><a href="#cb51-11"></a><span class="co">#> 5 2018 May 875 NA 364 90 169 673</span></span> <span id="cb51-12"><a href="#cb51-12"></a><span class="co">#> 6 2018 June 867 NA 342 76 170 610</span></span> <span id="cb51-13"><a href="#cb51-13"></a><span class="co">#> 7 2018 July 829 NA 360 108 183 594</span></span> <span id="cb51-14"><a href="#cb51-14"></a><span class="co">#> 8 2018 Augu… 939 NA 286 90 205 649</span></span> <span id="cb51-15"><a href="#cb51-15"></a><span class="co">#> 9 2018 Sept… 835 NA 304 117 175 560</span></span> <span id="cb51-16"><a href="#cb51-16"></a><span class="co">#> # … with 1 more variable: West <dbl></span></span></code></pre></div> <p>The pivotting spec allows us to be more precise about exactly how <code>pivot_longer(df, spec = spec)</code> changes the shape of <code>df</code>: it will have <code>nrow(df) * nrow(spec)</code> rows, and <code>ncol(df) - nrow(spec) + ncol(spec) - 2</code> columns.</p> </div> </div> <!-- code folding --> <!-- dynamically load mathjax for compatibility with self-contained --> <script> (function () { var script = document.createElement("script"); script.type = "text/javascript"; script.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"; document.getElementsByTagName("head")[0].appendChild(script); })(); </script> </body> </html>