EVOLUTION-MANAGER
Edit File: datatable-sd-usage.html
<!DOCTYPE html> <html> <head> <meta charset="utf-8" /> <meta name="generator" content="pandoc" /> <meta http-equiv="X-UA-Compatible" content="IE=EDGE" /> <meta name="viewport" content="width=device-width, initial-scale=1" /> <meta name="date" content="2022-10-09" /> <title>Using .SD for Data Analysis</title> <script>// Pandoc 2.9 adds attributes on both header and div. We remove the former (to // be compatible with the behavior of Pandoc < 2.8). document.addEventListener('DOMContentLoaded', function(e) { var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); var i, h, a; for (i = 0; i < hs.length; i++) { h = hs[i]; if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 a = h.attributes; while (a.length > 0) h.removeAttribute(a[0].name); } }); </script> <script>// Hide empty <a> tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> // v0.0.1 // Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. document.addEventListener('DOMContentLoaded', function() { const codeList = document.getElementsByClassName("sourceCode"); for (var i = 0; i < codeList.length; i++) { var linkList = codeList[i].getElementsByTagName('a'); for (var j = 0; j < linkList.length; j++) { if (linkList[j].innerHTML === "") { linkList[j].setAttribute('aria-hidden', 'true'); } } } }); </script> <style type="text/css"> code{white-space: pre-wrap;} span.smallcaps{font-variant: small-caps;} span.underline{text-decoration: underline;} div.column{display: inline-block; vertical-align: top; width: 50%;} div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;} ul.task-list{list-style: none;} </style> <style type="text/css"> code { white-space: pre; } .sourceCode { overflow: visible; } </style> <style type="text/css" data-origin="pandoc"> pre > code.sourceCode { white-space: pre; position: relative; } pre > code.sourceCode > span { display: inline-block; line-height: 1.25; } pre > code.sourceCode > span:empty { height: 1.2em; } code.sourceCode > span { color: inherit; text-decoration: inherit; } div.sourceCode { margin: 1em 0; } pre.sourceCode { margin: 0; } @media screen { div.sourceCode { overflow: auto; } } @media print { pre > code.sourceCode { white-space: pre-wrap; } pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; } } pre.numberSource code { counter-reset: source-line 0; } pre.numberSource code > span { position: relative; left: -4em; counter-increment: source-line; } pre.numberSource code > span > a:first-child::before { content: counter(source-line); position: relative; left: -1em; text-align: right; vertical-align: baseline; border: none; display: inline-block; -webkit-touch-callout: none; -webkit-user-select: none; -khtml-user-select: none; -moz-user-select: none; -ms-user-select: none; user-select: none; padding: 0 4px; width: 4em; color: #aaaaaa; } pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; } div.sourceCode { } @media screen { pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; } } code span.al { color: #ff0000; font-weight: bold; } /* Alert */ code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */ code span.at { color: #7d9029; } /* Attribute */ code span.bn { color: #40a070; } /* BaseN */ code span.bu { } /* BuiltIn */ code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */ code span.ch { color: #4070a0; } /* Char */ code span.cn { color: #880000; } /* Constant */ code span.co { color: #60a0b0; font-style: italic; } /* Comment */ code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */ code span.do { color: #ba2121; font-style: italic; } /* Documentation */ code span.dt { color: #902000; } /* DataType */ code span.dv { color: #40a070; } /* DecVal */ code span.er { color: #ff0000; font-weight: bold; } /* Error */ code span.ex { } /* Extension */ code span.fl { color: #40a070; } /* Float */ code span.fu { color: #06287e; } /* Function */ code span.im { } /* Import */ code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */ code span.kw { color: #007020; font-weight: bold; } /* Keyword */ code span.op { color: #666666; } /* Operator */ code span.ot { color: #007020; } /* Other */ code span.pp { color: #bc7a00; } /* Preprocessor */ code span.sc { color: #4070a0; } /* SpecialChar */ code span.ss { color: #bb6688; } /* SpecialString */ code span.st { color: #4070a0; } /* String */ code span.va { color: #19177c; } /* Variable */ code span.vs { color: #4070a0; } /* VerbatimString */ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */ </style> <script> // apply pandoc div.sourceCode style to pre.sourceCode instead (function() { var sheets = document.styleSheets; for (var i = 0; i < sheets.length; i++) { if (sheets[i].ownerNode.dataset["origin"] !== "pandoc") continue; try { var rules = sheets[i].cssRules; } catch (e) { continue; } var j = 0; while (j < rules.length) { var rule = rules[j]; // check if there is a div.sourceCode rule if (rule.type !== rule.STYLE_RULE || rule.selectorText !== "div.sourceCode") { j++; continue; } var style = rule.style.cssText; // check if color or background-color is set if (rule.style.color === '' && rule.style.backgroundColor === '') { j++; continue; } // replace div.sourceCode by a pre.sourceCode rule sheets[i].deleteRule(j); sheets[i].insertRule('pre.sourceCode{' + style + '}', j); } } })(); </script> <style type="text/css">body { background-color: #fff; margin: 1em auto; max-width: 700px; overflow: visible; padding-left: 2em; padding-right: 2em; font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif; font-size: 14px; line-height: 1.35; } #TOC { clear: both; margin: 0 0 10px 10px; padding: 4px; width: 400px; border: 1px solid #CCCCCC; border-radius: 5px; background-color: #f6f6f6; font-size: 13px; line-height: 1.3; } #TOC .toctitle { font-weight: bold; font-size: 15px; margin-left: 5px; } #TOC ul { padding-left: 40px; margin-left: -1.5em; margin-top: 5px; margin-bottom: 5px; } #TOC ul ul { margin-left: -2em; } #TOC li { line-height: 16px; } table { margin: 1em auto; border-width: 1px; border-color: #DDDDDD; border-style: outset; border-collapse: collapse; } table th { border-width: 2px; padding: 5px; border-style: inset; } table td { border-width: 1px; border-style: inset; line-height: 18px; padding: 5px 5px; } table, table th, table td { border-left-style: none; border-right-style: none; } table thead, table tr.even { background-color: #f7f7f7; } p { margin: 0.5em 0; } blockquote { background-color: #f6f6f6; padding: 0.25em 0.75em; } hr { border-style: solid; border: none; border-top: 1px solid #777; margin: 28px 0; } dl { margin-left: 0; } dl dd { margin-bottom: 13px; margin-left: 13px; } dl dt { font-weight: bold; } ul { margin-top: 0; } ul li { list-style: circle outside; } ul ul { margin-bottom: 0; } pre, code { background-color: #f7f7f7; border-radius: 3px; color: #333; white-space: pre-wrap; } pre { border-radius: 3px; margin: 5px 0px 10px 0px; padding: 10px; } pre:not([class]) { background-color: #f7f7f7; } code { font-family: Consolas, Monaco, 'Courier New', monospace; font-size: 85%; } p > code, li > code { padding: 2px 0px; } div.figure { text-align: center; } img { background-color: #FFFFFF; padding: 2px; border: 1px solid #DDDDDD; border-radius: 3px; border: 1px solid #CCCCCC; margin: 0 5px; } h1 { margin-top: 0; font-size: 35px; line-height: 40px; } h2 { border-bottom: 4px solid #f7f7f7; padding-top: 10px; padding-bottom: 2px; font-size: 145%; } h3 { border-bottom: 2px solid #f7f7f7; padding-top: 10px; font-size: 120%; } h4 { border-bottom: 1px solid #f7f7f7; margin-left: 8px; font-size: 105%; } h5, h6 { border-bottom: 1px solid #ccc; font-size: 105%; } a { color: #0033dd; text-decoration: none; } a:hover { color: #6666ff; } a:visited { color: #800080; } a:visited:hover { color: #BB00BB; } a[href^="http:"] { text-decoration: underline; } a[href^="https:"] { text-decoration: underline; } code > span.kw { color: #555; font-weight: bold; } code > span.dt { color: #902000; } code > span.dv { color: #40a070; } code > span.bn { color: #d14; } code > span.fl { color: #d14; } code > span.ch { color: #d14; } code > span.st { color: #d14; } code > span.co { color: #888888; font-style: italic; } code > span.ot { color: #007020; } code > span.al { color: #ff0000; font-weight: bold; } code > span.fu { color: #900; font-weight: bold; } code > span.er { color: #a61717; background-color: #e3d2d2; } </style> </head> <body> <h1 class="title toc-ignore">Using .SD for Data Analysis</h1> <h4 class="date">2022-10-09</h4> <div id="TOC"> <ul> <li><a href="#what-is-.sd"><span class="toc-section-number">1</span> What is <code>.SD</code>?</a> <ul> <li><a href="#loading-and-previewing-lahman-data"><span class="toc-section-number">1.1</span> Loading and Previewing Lahman Data</a></li> </ul></li> <li><a href="#sd-on-ungrouped-data"><span class="toc-section-number">2</span> <code>.SD</code> on Ungrouped Data</a> <ul> <li><a href="#column-subsetting-.sdcols"><span class="toc-section-number">2.1</span> Column Subsetting: <code>.SDcols</code></a></li> <li><a href="#column-type-conversion"><span class="toc-section-number">2.2</span> Column Type Conversion</a></li> <li><a href="#controlling-a-models-right-hand-side"><span class="toc-section-number">2.3</span> Controlling a Model’s Right-Hand Side</a></li> <li><a href="#conditional-joins"><span class="toc-section-number">2.4</span> Conditional Joins</a></li> </ul></li> <li><a href="#grouped-.sd-operations"><span class="toc-section-number">3</span> Grouped <code>.SD</code> operations</a> <ul> <li><a href="#group-subsetting"><span class="toc-section-number">3.1</span> Group Subsetting</a></li> <li><a href="#group-optima"><span class="toc-section-number">3.2</span> Group Optima</a></li> <li><a href="#grouped-regression"><span class="toc-section-number">3.3</span> Grouped Regression</a></li> </ul></li> </ul> </div> <p>This vignette will explain the most common ways to use the <code>.SD</code> variable in your <code>data.table</code> analyses. It is an adaptation of <a href="https://stackoverflow.com/a/47406952/3576984">this answer</a> given on StackOverflow.</p> <div id="what-is-.sd" class="section level1" number="1"> <h1 number="1"><span class="header-section-number">1</span> What is <code>.SD</code>?</h1> <p>In the broadest sense, <code>.SD</code> is just shorthand for capturing a variable that comes up frequently in the context of data analysis. It can be understood to stand for <em>S</em>ubset, <em>S</em>elfsame, or <em>S</em>elf-reference of the <em>D</em>ata. That is, <code>.SD</code> is in its most basic guise a <em>reflexive reference</em> to the <code>data.table</code> itself – as we’ll see in examples below, this is particularly helpful for chaining together “queries” (extractions/subsets/etc using <code>[</code>). In particular, this also means that <code>.SD</code> is <em>itself a <code>data.table</code></em> (with the caveat that it does not allow assignment with <code>:=</code>).</p> <p>The simpler usage of <code>.SD</code> is for column subsetting (i.e., when <code>.SDcols</code> is specified); as this version is much more straightforward to understand, we’ll cover that first below. The interpretation of <code>.SD</code> in its second usage, grouping scenarios (i.e., when <code>by =</code> or <code>keyby =</code> is specified), is slightly different, conceptually (though at core it’s the same, since, after all, a non-grouped operation is an edge case of grouping with just one group).</p> <div id="loading-and-previewing-lahman-data" class="section level2" number="1.1"> <h2 number="1.1"><span class="header-section-number">1.1</span> Loading and Previewing Lahman Data</h2> <p>To give this a more real-world feel, rather than making up data, let’s load some data sets about baseball from the <a href="https://www.seanlahman.com/baseball-archive/statistics/">Lahman database</a>. In typical R usage, we’d simply load these data sets from the <code>Lahman</code> R package; in this vignette, we’ve pre-downloaded them directly from the package’s GitHub page instead.</p> <div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true"></a><span class="kw">load</span>(<span class="st">'Teams.RData'</span>)</span> <span id="cb1-2"><a href="#cb1-2" aria-hidden="true"></a><span class="kw">setDT</span>(Teams)</span> <span id="cb1-3"><a href="#cb1-3" aria-hidden="true"></a>Teams</span> <span id="cb1-4"><a href="#cb1-4" aria-hidden="true"></a><span class="co"># yearID lgID teamID franchID divID Rank G Ghome W L DivWin WCWin LgWin WSWin R AB H</span></span> <span id="cb1-5"><a href="#cb1-5" aria-hidden="true"></a><span class="co"># 1: 1871 NA BS1 BNA <NA> 3 31 NA 20 10 <NA> <NA> N <NA> 401 1372 426</span></span> <span id="cb1-6"><a href="#cb1-6" aria-hidden="true"></a><span class="co"># 2: 1871 NA CH1 CNA <NA> 2 28 NA 19 9 <NA> <NA> N <NA> 302 1196 323</span></span> <span id="cb1-7"><a href="#cb1-7" aria-hidden="true"></a><span class="co"># 3: 1871 NA CL1 CFC <NA> 8 29 NA 10 19 <NA> <NA> N <NA> 249 1186 328</span></span> <span id="cb1-8"><a href="#cb1-8" aria-hidden="true"></a><span class="co"># 4: 1871 NA FW1 KEK <NA> 7 19 NA 7 12 <NA> <NA> N <NA> 137 746 178</span></span> <span id="cb1-9"><a href="#cb1-9" aria-hidden="true"></a><span class="co"># 5: 1871 NA NY2 NNA <NA> 5 33 NA 16 17 <NA> <NA> N <NA> 302 1404 403</span></span> <span id="cb1-10"><a href="#cb1-10" aria-hidden="true"></a><span class="co"># --- </span></span> <span id="cb1-11"><a href="#cb1-11" aria-hidden="true"></a><span class="co"># 2891: 2018 NL SLN STL C 3 162 81 88 74 N N N N 759 5498 1369</span></span> <span id="cb1-12"><a href="#cb1-12" aria-hidden="true"></a><span class="co"># 2892: 2018 AL TBA </span><span class="al">TBD</span><span class="co"> E 3 162 81 90 72 N N N N 716 5475 1415</span></span> <span id="cb1-13"><a href="#cb1-13" aria-hidden="true"></a><span class="co"># 2893: 2018 AL TEX TEX W 5 162 81 67 95 N N N N 737 5453 1308</span></span> <span id="cb1-14"><a href="#cb1-14" aria-hidden="true"></a><span class="co"># 2894: 2018 AL TOR TOR E 4 162 81 73 89 N N N N 709 5477 1336</span></span> <span id="cb1-15"><a href="#cb1-15" aria-hidden="true"></a><span class="co"># 2895: 2018 NL WAS WSN E 2 162 81 82 80 N N N N 771 5517 1402</span></span> <span id="cb1-16"><a href="#cb1-16" aria-hidden="true"></a><span class="co"># X2B X3B HR BB SO SB CS HBP SF RA ER ERA CG SHO SV IPouts HA HRA BBA SOA E DP</span></span> <span id="cb1-17"><a href="#cb1-17" aria-hidden="true"></a><span class="co"># 1: 70 37 3 60 19 73 16 NA NA 303 109 3.55 22 1 3 828 367 2 42 23 243 24</span></span> <span id="cb1-18"><a href="#cb1-18" aria-hidden="true"></a><span class="co"># 2: 52 21 10 60 22 69 21 NA NA 241 77 2.76 25 0 1 753 308 6 28 22 229 16</span></span> <span id="cb1-19"><a href="#cb1-19" aria-hidden="true"></a><span class="co"># 3: 35 40 7 26 25 18 8 NA NA 341 116 4.11 23 0 0 762 346 13 53 34 234 15</span></span> <span id="cb1-20"><a href="#cb1-20" aria-hidden="true"></a><span class="co"># 4: 19 8 2 33 9 16 4 NA NA 243 97 5.17 19 1 0 507 261 5 21 17 163 8</span></span> <span id="cb1-21"><a href="#cb1-21" aria-hidden="true"></a><span class="co"># 5: 43 21 1 33 15 46 15 NA NA 313 121 3.72 32 1 0 879 373 7 42 22 235 14</span></span> <span id="cb1-22"><a href="#cb1-22" aria-hidden="true"></a><span class="co"># --- </span></span> <span id="cb1-23"><a href="#cb1-23" aria-hidden="true"></a><span class="co"># 2891: 248 9 205 525 1380 63 32 80 48 691 622 3.85 1 8 43 4366 1354 144 593 1337 133 151</span></span> <span id="cb1-24"><a href="#cb1-24" aria-hidden="true"></a><span class="co"># 2892: 274 43 150 540 1388 128 51 101 50 646 602 3.74 0 14 52 4345 1236 164 501 1421 85 136</span></span> <span id="cb1-25"><a href="#cb1-25" aria-hidden="true"></a><span class="co"># 2893: 266 24 194 555 1484 74 35 88 34 848 783 4.92 1 5 42 4293 1516 222 491 1121 120 168</span></span> <span id="cb1-26"><a href="#cb1-26" aria-hidden="true"></a><span class="co"># 2894: 320 16 217 499 1387 47 30 58 37 832 772 4.85 0 3 39 4301 1476 208 551 1298 101 138</span></span> <span id="cb1-27"><a href="#cb1-27" aria-hidden="true"></a><span class="co"># 2895: 284 25 191 631 1289 119 33 59 40 682 649 4.04 2 7 40 4338 1320 198 487 1417 64 115</span></span> <span id="cb1-28"><a href="#cb1-28" aria-hidden="true"></a><span class="co"># FP name park attendance BPF PPF teamIDBR</span></span> <span id="cb1-29"><a href="#cb1-29" aria-hidden="true"></a><span class="co"># 1: 0.834 Boston Red Stockings South End Grounds I NA 103 98 BOS</span></span> <span id="cb1-30"><a href="#cb1-30" aria-hidden="true"></a><span class="co"># 2: 0.829 Chicago White Stockings Union Base-Ball Grounds NA 104 102 CHI</span></span> <span id="cb1-31"><a href="#cb1-31" aria-hidden="true"></a><span class="co"># 3: 0.818 Cleveland Forest Citys National Association Grounds NA 96 100 CLE</span></span> <span id="cb1-32"><a href="#cb1-32" aria-hidden="true"></a><span class="co"># 4: 0.803 Fort Wayne Kekiongas Hamilton Field NA 101 107 KEK</span></span> <span id="cb1-33"><a href="#cb1-33" aria-hidden="true"></a><span class="co"># 5: 0.840 New York Mutuals Union Grounds (Brooklyn) NA 90 88 NYU</span></span> <span id="cb1-34"><a href="#cb1-34" aria-hidden="true"></a><span class="co"># --- </span></span> <span id="cb1-35"><a href="#cb1-35" aria-hidden="true"></a><span class="co"># 2891: 0.978 St. Louis Cardinals Busch Stadium III 3403587 97 96 STL</span></span> <span id="cb1-36"><a href="#cb1-36" aria-hidden="true"></a><span class="co"># 2892: 0.986 Tampa Bay Rays Tropicana Field 1154973 97 97 TBR</span></span> <span id="cb1-37"><a href="#cb1-37" aria-hidden="true"></a><span class="co"># 2893: 0.980 Texas Rangers Rangers Ballpark in Arlington 2107107 112 113 TEX</span></span> <span id="cb1-38"><a href="#cb1-38" aria-hidden="true"></a><span class="co"># 2894: 0.983 Toronto Blue Jays Rogers Centre 2325281 97 98 TOR</span></span> <span id="cb1-39"><a href="#cb1-39" aria-hidden="true"></a><span class="co"># 2895: 0.989 Washington Nationals Nationals Park 2529604 106 105 WSN</span></span> <span id="cb1-40"><a href="#cb1-40" aria-hidden="true"></a><span class="co"># teamIDlahman45 teamIDretro</span></span> <span id="cb1-41"><a href="#cb1-41" aria-hidden="true"></a><span class="co"># 1: BS1 BS1</span></span> <span id="cb1-42"><a href="#cb1-42" aria-hidden="true"></a><span class="co"># 2: CH1 CH1</span></span> <span id="cb1-43"><a href="#cb1-43" aria-hidden="true"></a><span class="co"># 3: CL1 CL1</span></span> <span id="cb1-44"><a href="#cb1-44" aria-hidden="true"></a><span class="co"># 4: FW1 FW1</span></span> <span id="cb1-45"><a href="#cb1-45" aria-hidden="true"></a><span class="co"># 5: NY2 NY2</span></span> <span id="cb1-46"><a href="#cb1-46" aria-hidden="true"></a><span class="co"># --- </span></span> <span id="cb1-47"><a href="#cb1-47" aria-hidden="true"></a><span class="co"># 2891: SLN SLN</span></span> <span id="cb1-48"><a href="#cb1-48" aria-hidden="true"></a><span class="co"># 2892: TBA TBA</span></span> <span id="cb1-49"><a href="#cb1-49" aria-hidden="true"></a><span class="co"># 2893: TEX TEX</span></span> <span id="cb1-50"><a href="#cb1-50" aria-hidden="true"></a><span class="co"># 2894: TOR TOR</span></span> <span id="cb1-51"><a href="#cb1-51" aria-hidden="true"></a><span class="co"># 2895: MON WAS</span></span> <span id="cb1-52"><a href="#cb1-52" aria-hidden="true"></a></span> <span id="cb1-53"><a href="#cb1-53" aria-hidden="true"></a><span class="kw">load</span>(<span class="st">'Pitching.RData'</span>)</span> <span id="cb1-54"><a href="#cb1-54" aria-hidden="true"></a><span class="kw">setDT</span>(Pitching)</span> <span id="cb1-55"><a href="#cb1-55" aria-hidden="true"></a>Pitching</span> <span id="cb1-56"><a href="#cb1-56" aria-hidden="true"></a><span class="co"># playerID yearID stint teamID lgID W L G GS CG SHO SV IPouts H ER HR BB SO BAOpp</span></span> <span id="cb1-57"><a href="#cb1-57" aria-hidden="true"></a><span class="co"># 1: bechtge01 1871 1 PH1 NA 1 2 3 3 2 0 0 78 43 23 0 11 1 NA</span></span> <span id="cb1-58"><a href="#cb1-58" aria-hidden="true"></a><span class="co"># 2: brainas01 1871 1 WS3 NA 12 15 30 30 30 0 0 792 361 132 4 37 13 NA</span></span> <span id="cb1-59"><a href="#cb1-59" aria-hidden="true"></a><span class="co"># 3: fergubo01 1871 1 NY2 NA 0 0 1 0 0 0 0 3 8 3 0 0 0 NA</span></span> <span id="cb1-60"><a href="#cb1-60" aria-hidden="true"></a><span class="co"># 4: fishech01 1871 1 RC1 NA 4 16 24 24 22 1 0 639 295 103 3 31 15 NA</span></span> <span id="cb1-61"><a href="#cb1-61" aria-hidden="true"></a><span class="co"># 5: fleetfr01 1871 1 NY2 NA 0 1 1 1 1 0 0 27 20 10 0 3 0 NA</span></span> <span id="cb1-62"><a href="#cb1-62" aria-hidden="true"></a><span class="co"># --- </span></span> <span id="cb1-63"><a href="#cb1-63" aria-hidden="true"></a><span class="co"># 46695: zamorda01 2018 1 NYN NL 1 0 16 0 0 0 0 27 6 3 1 3 16 0.194</span></span> <span id="cb1-64"><a href="#cb1-64" aria-hidden="true"></a><span class="co"># 46696: zastrro01 2018 1 CHN NL 1 0 6 0 0 0 0 17 6 3 0 4 3 0.286</span></span> <span id="cb1-65"><a href="#cb1-65" aria-hidden="true"></a><span class="co"># 46697: zieglbr01 2018 1 MIA NL 1 5 53 0 0 0 10 156 49 23 7 17 37 0.254</span></span> <span id="cb1-66"><a href="#cb1-66" aria-hidden="true"></a><span class="co"># 46698: zieglbr01 2018 2 ARI NL 1 1 29 0 0 0 0 65 22 9 1 8 13 0.265</span></span> <span id="cb1-67"><a href="#cb1-67" aria-hidden="true"></a><span class="co"># 46699: zimmejo02 2018 1 DET AL 7 8 25 25 0 0 0 394 140 66 28 26 111 0.269</span></span> <span id="cb1-68"><a href="#cb1-68" aria-hidden="true"></a><span class="co"># ERA IBB WP HBP BK BFP GF R SH SF GIDP</span></span> <span id="cb1-69"><a href="#cb1-69" aria-hidden="true"></a><span class="co"># 1: 7.96 NA 7 NA 0 146 0 42 NA NA NA</span></span> <span id="cb1-70"><a href="#cb1-70" aria-hidden="true"></a><span class="co"># 2: 4.50 NA 7 NA 0 1291 0 292 NA NA NA</span></span> <span id="cb1-71"><a href="#cb1-71" aria-hidden="true"></a><span class="co"># 3: 27.00 NA 2 NA 0 14 0 9 NA NA NA</span></span> <span id="cb1-72"><a href="#cb1-72" aria-hidden="true"></a><span class="co"># 4: 4.35 NA 20 NA 0 1080 1 257 NA NA NA</span></span> <span id="cb1-73"><a href="#cb1-73" aria-hidden="true"></a><span class="co"># 5: 10.00 NA 0 NA 0 57 0 21 NA NA NA</span></span> <span id="cb1-74"><a href="#cb1-74" aria-hidden="true"></a><span class="co"># --- </span></span> <span id="cb1-75"><a href="#cb1-75" aria-hidden="true"></a><span class="co"># 46695: 3.00 1 0 1 0 36 4 3 1 0 1</span></span> <span id="cb1-76"><a href="#cb1-76" aria-hidden="true"></a><span class="co"># 46696: 4.76 0 0 1 0 26 2 3 0 0 0</span></span> <span id="cb1-77"><a href="#cb1-77" aria-hidden="true"></a><span class="co"># 46697: 3.98 4 1 2 0 213 23 25 0 1 11</span></span> <span id="cb1-78"><a href="#cb1-78" aria-hidden="true"></a><span class="co"># 46698: 3.74 2 0 0 0 92 1 9 0 1 3</span></span> <span id="cb1-79"><a href="#cb1-79" aria-hidden="true"></a><span class="co"># 46699: 4.52 0 1 2 0 556 0 76 2 5 4</span></span></code></pre></div> <p>Readers up on baseball lingo should find the tables’ contents familiar; <code>Teams</code> records some statistics for a given team in a given year, while <code>Pitching</code> records statistics for a given pitcher in a given year. Please do check out the <a href="https://www.seanlahman.com/files/database/readme2017.txt">documentation</a> and explore the data yourself a bit before proceeding to familiarize yourself with their structure.</p> </div> </div> <div id="sd-on-ungrouped-data" class="section level1" number="2"> <h1 number="2"><span class="header-section-number">2</span> <code>.SD</code> on Ungrouped Data</h1> <p>To illustrate what I mean about the reflexive nature of <code>.SD</code>, consider its most banal usage:</p> <div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true"></a>Pitching[ , .SD]</span> <span id="cb2-2"><a href="#cb2-2" aria-hidden="true"></a><span class="co"># playerID yearID stint teamID lgID W L G GS CG SHO SV IPouts H ER HR BB SO BAOpp</span></span> <span id="cb2-3"><a href="#cb2-3" aria-hidden="true"></a><span class="co"># 1: bechtge01 1871 1 PH1 NA 1 2 3 3 2 0 0 78 43 23 0 11 1 NA</span></span> <span id="cb2-4"><a href="#cb2-4" aria-hidden="true"></a><span class="co"># 2: brainas01 1871 1 WS3 NA 12 15 30 30 30 0 0 792 361 132 4 37 13 NA</span></span> <span id="cb2-5"><a href="#cb2-5" aria-hidden="true"></a><span class="co"># 3: fergubo01 1871 1 NY2 NA 0 0 1 0 0 0 0 3 8 3 0 0 0 NA</span></span> <span id="cb2-6"><a href="#cb2-6" aria-hidden="true"></a><span class="co"># 4: fishech01 1871 1 RC1 NA 4 16 24 24 22 1 0 639 295 103 3 31 15 NA</span></span> <span id="cb2-7"><a href="#cb2-7" aria-hidden="true"></a><span class="co"># 5: fleetfr01 1871 1 NY2 NA 0 1 1 1 1 0 0 27 20 10 0 3 0 NA</span></span> <span id="cb2-8"><a href="#cb2-8" aria-hidden="true"></a><span class="co"># --- </span></span> <span id="cb2-9"><a href="#cb2-9" aria-hidden="true"></a><span class="co"># 46695: zamorda01 2018 1 NYN NL 1 0 16 0 0 0 0 27 6 3 1 3 16 0.194</span></span> <span id="cb2-10"><a href="#cb2-10" aria-hidden="true"></a><span class="co"># 46696: zastrro01 2018 1 CHN NL 1 0 6 0 0 0 0 17 6 3 0 4 3 0.286</span></span> <span id="cb2-11"><a href="#cb2-11" aria-hidden="true"></a><span class="co"># 46697: zieglbr01 2018 1 MIA NL 1 5 53 0 0 0 10 156 49 23 7 17 37 0.254</span></span> <span id="cb2-12"><a href="#cb2-12" aria-hidden="true"></a><span class="co"># 46698: zieglbr01 2018 2 ARI NL 1 1 29 0 0 0 0 65 22 9 1 8 13 0.265</span></span> <span id="cb2-13"><a href="#cb2-13" aria-hidden="true"></a><span class="co"># 46699: zimmejo02 2018 1 DET AL 7 8 25 25 0 0 0 394 140 66 28 26 111 0.269</span></span> <span id="cb2-14"><a href="#cb2-14" aria-hidden="true"></a><span class="co"># ERA IBB WP HBP BK BFP GF R SH SF GIDP</span></span> <span id="cb2-15"><a href="#cb2-15" aria-hidden="true"></a><span class="co"># 1: 7.96 NA 7 NA 0 146 0 42 NA NA NA</span></span> <span id="cb2-16"><a href="#cb2-16" aria-hidden="true"></a><span class="co"># 2: 4.50 NA 7 NA 0 1291 0 292 NA NA NA</span></span> <span id="cb2-17"><a href="#cb2-17" aria-hidden="true"></a><span class="co"># 3: 27.00 NA 2 NA 0 14 0 9 NA NA NA</span></span> <span id="cb2-18"><a href="#cb2-18" aria-hidden="true"></a><span class="co"># 4: 4.35 NA 20 NA 0 1080 1 257 NA NA NA</span></span> <span id="cb2-19"><a href="#cb2-19" aria-hidden="true"></a><span class="co"># 5: 10.00 NA 0 NA 0 57 0 21 NA NA NA</span></span> <span id="cb2-20"><a href="#cb2-20" aria-hidden="true"></a><span class="co"># --- </span></span> <span id="cb2-21"><a href="#cb2-21" aria-hidden="true"></a><span class="co"># 46695: 3.00 1 0 1 0 36 4 3 1 0 1</span></span> <span id="cb2-22"><a href="#cb2-22" aria-hidden="true"></a><span class="co"># 46696: 4.76 0 0 1 0 26 2 3 0 0 0</span></span> <span id="cb2-23"><a href="#cb2-23" aria-hidden="true"></a><span class="co"># 46697: 3.98 4 1 2 0 213 23 25 0 1 11</span></span> <span id="cb2-24"><a href="#cb2-24" aria-hidden="true"></a><span class="co"># 46698: 3.74 2 0 0 0 92 1 9 0 1 3</span></span> <span id="cb2-25"><a href="#cb2-25" aria-hidden="true"></a><span class="co"># 46699: 4.52 0 1 2 0 556 0 76 2 5 4</span></span></code></pre></div> <p>That is, <code>Pitching[ , .SD]</code> has simply returned the whole table, i.e., this was an overly verbose way of writing <code>Pitching</code> or <code>Pitching[]</code>:</p> <div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true"></a><span class="kw">identical</span>(Pitching, Pitching[ , .SD])</span> <span id="cb3-2"><a href="#cb3-2" aria-hidden="true"></a><span class="co"># [1] TRUE</span></span></code></pre></div> <p>In terms of subsetting, <code>.SD</code> is still a subset of the data, it’s just a trivial one (the set itself).</p> <div id="column-subsetting-.sdcols" class="section level2" number="2.1"> <h2 number="2.1"><span class="header-section-number">2.1</span> Column Subsetting: <code>.SDcols</code></h2> <p>The first way to impact what <code>.SD</code> is is to limit the <em>columns</em> contained in <code>.SD</code> using the <code>.SDcols</code> argument to <code>[</code>:</p> <div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true"></a><span class="co"># W: Wins; L: Losses; G: Games</span></span> <span id="cb4-2"><a href="#cb4-2" aria-hidden="true"></a>Pitching[ , .SD, .SDcols =<span class="st"> </span><span class="kw">c</span>(<span class="st">'W'</span>, <span class="st">'L'</span>, <span class="st">'G'</span>)]</span> <span id="cb4-3"><a href="#cb4-3" aria-hidden="true"></a><span class="co"># W L G</span></span> <span id="cb4-4"><a href="#cb4-4" aria-hidden="true"></a><span class="co"># 1: 1 2 3</span></span> <span id="cb4-5"><a href="#cb4-5" aria-hidden="true"></a><span class="co"># 2: 12 15 30</span></span> <span id="cb4-6"><a href="#cb4-6" aria-hidden="true"></a><span class="co"># 3: 0 0 1</span></span> <span id="cb4-7"><a href="#cb4-7" aria-hidden="true"></a><span class="co"># 4: 4 16 24</span></span> <span id="cb4-8"><a href="#cb4-8" aria-hidden="true"></a><span class="co"># 5: 0 1 1</span></span> <span id="cb4-9"><a href="#cb4-9" aria-hidden="true"></a><span class="co"># --- </span></span> <span id="cb4-10"><a href="#cb4-10" aria-hidden="true"></a><span class="co"># 46695: 1 0 16</span></span> <span id="cb4-11"><a href="#cb4-11" aria-hidden="true"></a><span class="co"># 46696: 1 0 6</span></span> <span id="cb4-12"><a href="#cb4-12" aria-hidden="true"></a><span class="co"># 46697: 1 5 53</span></span> <span id="cb4-13"><a href="#cb4-13" aria-hidden="true"></a><span class="co"># 46698: 1 1 29</span></span> <span id="cb4-14"><a href="#cb4-14" aria-hidden="true"></a><span class="co"># 46699: 7 8 25</span></span></code></pre></div> <p>This is just for illustration and was pretty boring. But even this simply usage lends itself to a wide variety of highly beneficial / ubiquitous data manipulation operations:</p> </div> <div id="column-type-conversion" class="section level2" number="2.2"> <h2 number="2.2"><span class="header-section-number">2.2</span> Column Type Conversion</h2> <p>Column type conversion is a fact of life for data munging. Though <a href="https://github.com/Rdatatable/data.table/pull/2545"><code>fwrite</code> recently gained the ability to declare the class of each column up front</a>, not all data sets come from <code>fread</code> (e.g. in this vignette) and conversions back and forth among <code>character</code>/<code>factor</code>/<code>numeric</code> types are common. We can use <code>.SD</code> and <code>.SDcols</code> to batch-convert groups of columns to a common type.</p> <p>We notice that the following columns are stored as <code>character</code> in the <code>Teams</code> data set, but might more logically be stored as <code>factor</code>s:</p> <div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true"></a><span class="co"># teamIDBR: Team ID used by Baseball Reference website</span></span> <span id="cb5-2"><a href="#cb5-2" aria-hidden="true"></a><span class="co"># teamIDlahman45: Team ID used in Lahman database version 4.5</span></span> <span id="cb5-3"><a href="#cb5-3" aria-hidden="true"></a><span class="co"># teamIDretro: Team ID used by Retrosheet</span></span> <span id="cb5-4"><a href="#cb5-4" aria-hidden="true"></a>fkt =<span class="st"> </span><span class="kw">c</span>(<span class="st">'teamIDBR'</span>, <span class="st">'teamIDlahman45'</span>, <span class="st">'teamIDretro'</span>)</span> <span id="cb5-5"><a href="#cb5-5" aria-hidden="true"></a><span class="co"># confirm that they're stored as `character`</span></span> <span id="cb5-6"><a href="#cb5-6" aria-hidden="true"></a>Teams[ , <span class="kw">sapply</span>(.SD, is.character), .SDcols =<span class="st"> </span>fkt]</span> <span id="cb5-7"><a href="#cb5-7" aria-hidden="true"></a><span class="co"># teamIDBR teamIDlahman45 teamIDretro </span></span> <span id="cb5-8"><a href="#cb5-8" aria-hidden="true"></a><span class="co"># TRUE TRUE TRUE</span></span></code></pre></div> <p>If you’re confused by the use of <code>sapply</code> here, note that it’s quite similar for base R <code>data.frames</code>:</p> <div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true"></a><span class="kw">setDF</span>(Teams) <span class="co"># convert to data.frame for illustration</span></span> <span id="cb6-2"><a href="#cb6-2" aria-hidden="true"></a><span class="kw">sapply</span>(Teams[ , fkt], is.character)</span> <span id="cb6-3"><a href="#cb6-3" aria-hidden="true"></a><span class="co"># teamIDBR teamIDlahman45 teamIDretro </span></span> <span id="cb6-4"><a href="#cb6-4" aria-hidden="true"></a><span class="co"># TRUE TRUE TRUE</span></span> <span id="cb6-5"><a href="#cb6-5" aria-hidden="true"></a><span class="kw">setDT</span>(Teams) <span class="co"># convert back to data.table</span></span></code></pre></div> <p>The key to understanding this syntax is to recall that a <code>data.table</code> (as well as a <code>data.frame</code>) can be considered as a <code>list</code> where each element is a column – thus, <code>sapply</code>/<code>lapply</code> applies the <code>FUN</code> argument (in this case, <code>is.character</code>) to each <em>column</em> and returns the result as <code>sapply</code>/<code>lapply</code> usually would.</p> <p>The syntax to now convert these columns to <code>factor</code> is very similar – simply add the <code>:=</code> assignment operator:</p> <div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true"></a>Teams[ , (fkt) <span class="op">:</span><span class="er">=</span><span class="st"> </span><span class="kw">lapply</span>(.SD, factor), .SDcols =<span class="st"> </span>fkt]</span> <span id="cb7-2"><a href="#cb7-2" aria-hidden="true"></a><span class="co"># print out the first column to demonstrate success</span></span> <span id="cb7-3"><a href="#cb7-3" aria-hidden="true"></a><span class="kw">head</span>(<span class="kw">unique</span>(Teams[[fkt[1L]]]))</span> <span id="cb7-4"><a href="#cb7-4" aria-hidden="true"></a><span class="co"># [1] BOS CHI CLE KEK NYU ATH</span></span> <span id="cb7-5"><a href="#cb7-5" aria-hidden="true"></a><span class="co"># 101 Levels: ALT ANA ARI ATH ATL BAL BLA BLN BLU BOS BRA BRG BRO BSN BTT BUF BWW CAL CEN CHC ... WSN</span></span></code></pre></div> <p>Note that we must wrap <code>fkt</code> in parentheses <code>()</code> to force <code>data.table</code> to interpret this as column names, instead of trying to assign a column named <code>'fkt'</code>.</p> <p>Actually, the <code>.SDcols</code> argument is quite flexible; above, we supplied a <code>character</code> vector of column names. In other situations, it is more convenient to supply an <code>integer</code> vector of column <em>positions</em> or a <code>logical</code> vector dictating include/exclude for each column. <code>.SDcols</code> even accepts regular expression-based pattern matching.</p> <p>For example, we could do the following to convert all <code>factor</code> columns to <code>character</code>:</p> <div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true"></a><span class="co"># while .SDcols accepts a logical vector,</span></span> <span id="cb8-2"><a href="#cb8-2" aria-hidden="true"></a><span class="co"># := does not, so we need to convert to column</span></span> <span id="cb8-3"><a href="#cb8-3" aria-hidden="true"></a><span class="co"># positions with which()</span></span> <span id="cb8-4"><a href="#cb8-4" aria-hidden="true"></a>fkt_idx =<span class="st"> </span><span class="kw">which</span>(<span class="kw">sapply</span>(Teams, is.factor))</span> <span id="cb8-5"><a href="#cb8-5" aria-hidden="true"></a>Teams[ , (fkt_idx) <span class="op">:</span><span class="er">=</span><span class="st"> </span><span class="kw">lapply</span>(.SD, as.character), .SDcols =<span class="st"> </span>fkt_idx]</span> <span id="cb8-6"><a href="#cb8-6" aria-hidden="true"></a><span class="kw">head</span>(<span class="kw">unique</span>(Teams[[fkt_idx[1L]]]))</span> <span id="cb8-7"><a href="#cb8-7" aria-hidden="true"></a><span class="co"># [1] "NA" "NL" "AA" "UA" "PL" "AL"</span></span></code></pre></div> <p>Lastly, we can do pattern-based matching of columns in <code>.SDcols</code> to select all columns which contain <code>team</code> back to <code>factor</code>:</p> <div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true"></a>Teams[ , .SD, .SDcols =<span class="st"> </span><span class="kw">patterns</span>(<span class="st">'team'</span>)]</span> <span id="cb9-2"><a href="#cb9-2" aria-hidden="true"></a><span class="co"># teamID teamIDBR teamIDlahman45 teamIDretro</span></span> <span id="cb9-3"><a href="#cb9-3" aria-hidden="true"></a><span class="co"># 1: BS1 BOS BS1 BS1</span></span> <span id="cb9-4"><a href="#cb9-4" aria-hidden="true"></a><span class="co"># 2: CH1 CHI CH1 CH1</span></span> <span id="cb9-5"><a href="#cb9-5" aria-hidden="true"></a><span class="co"># 3: CL1 CLE CL1 CL1</span></span> <span id="cb9-6"><a href="#cb9-6" aria-hidden="true"></a><span class="co"># 4: FW1 KEK FW1 FW1</span></span> <span id="cb9-7"><a href="#cb9-7" aria-hidden="true"></a><span class="co"># 5: NY2 NYU NY2 NY2</span></span> <span id="cb9-8"><a href="#cb9-8" aria-hidden="true"></a><span class="co"># --- </span></span> <span id="cb9-9"><a href="#cb9-9" aria-hidden="true"></a><span class="co"># 2891: SLN STL SLN SLN</span></span> <span id="cb9-10"><a href="#cb9-10" aria-hidden="true"></a><span class="co"># 2892: TBA TBR TBA TBA</span></span> <span id="cb9-11"><a href="#cb9-11" aria-hidden="true"></a><span class="co"># 2893: TEX TEX TEX TEX</span></span> <span id="cb9-12"><a href="#cb9-12" aria-hidden="true"></a><span class="co"># 2894: TOR TOR TOR TOR</span></span> <span id="cb9-13"><a href="#cb9-13" aria-hidden="true"></a><span class="co"># 2895: WAS WSN MON WAS</span></span> <span id="cb9-14"><a href="#cb9-14" aria-hidden="true"></a></span> <span id="cb9-15"><a href="#cb9-15" aria-hidden="true"></a><span class="co"># now convert these columns to factor;</span></span> <span id="cb9-16"><a href="#cb9-16" aria-hidden="true"></a><span class="co"># value = TRUE in grep() is for the LHS of := to</span></span> <span id="cb9-17"><a href="#cb9-17" aria-hidden="true"></a><span class="co"># get column names instead of positions</span></span> <span id="cb9-18"><a href="#cb9-18" aria-hidden="true"></a>team_idx =<span class="st"> </span><span class="kw">grep</span>(<span class="st">'team'</span>, <span class="kw">names</span>(Teams), <span class="dt">value =</span> <span class="ot">TRUE</span>)</span> <span id="cb9-19"><a href="#cb9-19" aria-hidden="true"></a>Teams[ , (team_idx) <span class="op">:</span><span class="er">=</span><span class="st"> </span><span class="kw">lapply</span>(.SD, factor), .SDcols =<span class="st"> </span>team_idx]</span></code></pre></div> <p>** A proviso to the above: <em>explicitly</em> using column numbers (like <code>DT[ , (1) := rnorm(.N)]</code>) is bad practice and can lead to silently corrupted code over time if column positions change. Even implicitly using numbers can be dangerous if we don’t keep smart/strict control over the ordering of when we create the numbered index and when we use it.</p> </div> <div id="controlling-a-models-right-hand-side" class="section level2" number="2.3"> <h2 number="2.3"><span class="header-section-number">2.3</span> Controlling a Model’s Right-Hand Side</h2> <p>Varying model specification is a core feature of robust statistical analysis. Let’s try and predict a pitcher’s ERA (Earned Runs Average, a measure of performance) using the small set of covariates available in the <code>Pitching</code> table. How does the (linear) relationship between <code>W</code> (wins) and <code>ERA</code> vary depending on which other covariates are included in the specification?</p> <p>Here’s a short script leveraging the power of <code>.SD</code> which explores this question:</p> <div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true"></a><span class="co"># this generates a list of the 2^k possible extra variables</span></span> <span id="cb10-2"><a href="#cb10-2" aria-hidden="true"></a><span class="co"># for models of the form ERA ~ G + (...)</span></span> <span id="cb10-3"><a href="#cb10-3" aria-hidden="true"></a>extra_var =<span class="st"> </span><span class="kw">c</span>(<span class="st">'yearID'</span>, <span class="st">'teamID'</span>, <span class="st">'G'</span>, <span class="st">'L'</span>)</span> <span id="cb10-4"><a href="#cb10-4" aria-hidden="true"></a>models =<span class="st"> </span><span class="kw">unlist</span>(</span> <span id="cb10-5"><a href="#cb10-5" aria-hidden="true"></a> <span class="kw">lapply</span>(0L<span class="op">:</span><span class="kw">length</span>(extra_var), combn, <span class="dt">x =</span> extra_var, <span class="dt">simplify =</span> <span class="ot">FALSE</span>),</span> <span id="cb10-6"><a href="#cb10-6" aria-hidden="true"></a> <span class="dt">recursive =</span> <span class="ot">FALSE</span></span> <span id="cb10-7"><a href="#cb10-7" aria-hidden="true"></a>)</span> <span id="cb10-8"><a href="#cb10-8" aria-hidden="true"></a></span> <span id="cb10-9"><a href="#cb10-9" aria-hidden="true"></a><span class="co"># here are 16 visually distinct colors, taken from the list of 20 here:</span></span> <span id="cb10-10"><a href="#cb10-10" aria-hidden="true"></a><span class="co"># https://sashat.me/2017/01/11/list-of-20-simple-distinct-colors/</span></span> <span id="cb10-11"><a href="#cb10-11" aria-hidden="true"></a>col16 =<span class="st"> </span><span class="kw">c</span>(<span class="st">'#e6194b'</span>, <span class="st">'#3cb44b'</span>, <span class="st">'#ffe119'</span>, <span class="st">'#0082c8'</span>,</span> <span id="cb10-12"><a href="#cb10-12" aria-hidden="true"></a> <span class="st">'#f58231'</span>, <span class="st">'#911eb4'</span>, <span class="st">'#46f0f0'</span>, <span class="st">'#f032e6'</span>,</span> <span id="cb10-13"><a href="#cb10-13" aria-hidden="true"></a> <span class="st">'#d2f53c'</span>, <span class="st">'#fabebe'</span>, <span class="st">'#008080'</span>, <span class="st">'#e6beff'</span>,</span> <span id="cb10-14"><a href="#cb10-14" aria-hidden="true"></a> <span class="st">'#aa6e28'</span>, <span class="st">'#fffac8'</span>, <span class="st">'#800000'</span>, <span class="st">'#aaffc3'</span>)</span> <span id="cb10-15"><a href="#cb10-15" aria-hidden="true"></a></span> <span id="cb10-16"><a href="#cb10-16" aria-hidden="true"></a><span class="kw">par</span>(<span class="dt">oma =</span> <span class="kw">c</span>(<span class="dv">2</span>, <span class="dv">0</span>, <span class="dv">0</span>, <span class="dv">0</span>))</span> <span id="cb10-17"><a href="#cb10-17" aria-hidden="true"></a>lm_coef =<span class="st"> </span><span class="kw">sapply</span>(models, <span class="cf">function</span>(rhs) {</span> <span id="cb10-18"><a href="#cb10-18" aria-hidden="true"></a> <span class="co"># using ERA ~ . and data = .SD, then varying which</span></span> <span id="cb10-19"><a href="#cb10-19" aria-hidden="true"></a> <span class="co"># columns are included in .SD allows us to perform this</span></span> <span id="cb10-20"><a href="#cb10-20" aria-hidden="true"></a> <span class="co"># iteration over 16 models succinctly.</span></span> <span id="cb10-21"><a href="#cb10-21" aria-hidden="true"></a> <span class="co"># coef(.)['W'] extracts the W coefficient from each model fit</span></span> <span id="cb10-22"><a href="#cb10-22" aria-hidden="true"></a> Pitching[ , <span class="kw">coef</span>(<span class="kw">lm</span>(ERA <span class="op">~</span><span class="st"> </span>., <span class="dt">data =</span> .SD))[<span class="st">'W'</span>], .SDcols =<span class="st"> </span><span class="kw">c</span>(<span class="st">'W'</span>, rhs)]</span> <span id="cb10-23"><a href="#cb10-23" aria-hidden="true"></a>})</span> <span id="cb10-24"><a href="#cb10-24" aria-hidden="true"></a><span class="kw">barplot</span>(lm_coef, <span class="dt">names.arg =</span> <span class="kw">sapply</span>(models, paste, <span class="dt">collapse =</span> <span class="st">'/'</span>),</span> <span id="cb10-25"><a href="#cb10-25" aria-hidden="true"></a> <span class="dt">main =</span> <span class="st">'Wins Coefficient</span><span class="ch">\n</span><span class="st">With Various Covariates'</span>,</span> <span id="cb10-26"><a href="#cb10-26" aria-hidden="true"></a> <span class="dt">col =</span> col16, <span class="dt">las =</span> 2L, <span class="dt">cex.names =</span> <span class="fl">.8</span>)</span></code></pre></div> <p><img src="" width="100%" /></p> <p>The coefficient always has the expected sign (better pitchers tend to have more wins and fewer runs allowed), but the magnitude can vary substantially depending on what else we control for.</p> </div> <div id="conditional-joins" class="section level2" number="2.4"> <h2 number="2.4"><span class="header-section-number">2.4</span> Conditional Joins</h2> <p><code>data.table</code> syntax is beautiful for its simplicity and robustness. The syntax <code>x[i]</code> flexibly handles three common approaches to subsetting – when <code>i</code> is a <code>logical</code> vector, <code>x[i]</code> will return those rows of <code>x</code> corresponding to where <code>i</code> is <code>TRUE</code>; when <code>i</code> is <em>another <code>data.table</code></em> (or a <code>list</code>), a (right) <code>join</code> is performed (in the plain form, using the <code>key</code>s of <code>x</code> and <code>i</code>, otherwise, when <code>on =</code> is specified, using matches of those columns); and when <code>i</code> is a character, it is interpreted as shorthand for <code>x[list(i)]</code>, i.e., as a join.</p> <p>This is great in general, but falls short when we wish to perform a <em>conditional join</em>, wherein the exact nature of the relationship among tables depends on some characteristics of the rows in one or more columns.</p> <p>This example is admittedly a tad contrived, but illustrates the idea; see here (<a href="https://stackoverflow.com/questions/31329939/conditional-keyed-join-update-and-update-a-flag-column-for-matches">1</a>, <a href="https://stackoverflow.com/questions/29658627/conditional-binary-join-and-update-by-reference-using-the-data-table-package">2</a>) for more.</p> <p>The goal is to add a column <code>team_performance</code> to the <code>Pitching</code> table that records the team’s performance (rank) of the best pitcher on each team (as measured by the lowest ERA, among pitchers with at least 6 recorded games).</p> <div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true"></a><span class="co"># to exclude pitchers with exceptional performance in a few games,</span></span> <span id="cb11-2"><a href="#cb11-2" aria-hidden="true"></a><span class="co"># subset first; then define rank of pitchers within their team each year</span></span> <span id="cb11-3"><a href="#cb11-3" aria-hidden="true"></a><span class="co"># (in general, we should put more care into the 'ties.method' of frank)</span></span> <span id="cb11-4"><a href="#cb11-4" aria-hidden="true"></a>Pitching[G <span class="op">></span><span class="st"> </span><span class="dv">5</span>, rank_in_team <span class="op">:</span><span class="er">=</span><span class="st"> </span><span class="kw">frank</span>(ERA), by =<span class="st"> </span>.(teamID, yearID)]</span> <span id="cb11-5"><a href="#cb11-5" aria-hidden="true"></a>Pitching[rank_in_team <span class="op">==</span><span class="st"> </span><span class="dv">1</span>, team_performance <span class="op">:</span><span class="er">=</span></span> <span id="cb11-6"><a href="#cb11-6" aria-hidden="true"></a><span class="st"> </span>Teams[.SD, Rank, on =<span class="st"> </span><span class="kw">c</span>(<span class="st">'teamID'</span>, <span class="st">'yearID'</span>)]]</span></code></pre></div> <p>Note that the <code>x[y]</code> syntax returns <code>nrow(y)</code> values (i.e., it’s a right join), which is why <code>.SD</code> is on the right in <code>Teams[.SD]</code> (since the RHS of <code>:=</code> in this case requires <code>nrow(Pitching[rank_in_team == 1])</code> values.</p> </div> </div> <div id="grouped-.sd-operations" class="section level1" number="3"> <h1 number="3"><span class="header-section-number">3</span> Grouped <code>.SD</code> operations</h1> <p>Often, we’d like to perform some operation on our data <em>at the group level</em>. When we specify <code>by =</code> (or <code>keyby =</code>), the mental model for what happens when <code>data.table</code> processes <code>j</code> is to think of your <code>data.table</code> as being split into many component sub-<code>data.table</code>s, each of which corresponds to a single value of your <code>by</code> variable(s):</p> <div class="figure"> <img src="" alt="Grouping, Illustrated" width="100%" /> <p class="caption"> Grouping, Illustrated </p> </div> <p>In the case of grouping, <code>.SD</code> is multiple in nature – it refers to <em>each</em> of these sub-<code>data.table</code>s, <em>one-at-a-time</em> (slightly more accurately, the scope of <code>.SD</code> is a single sub-<code>data.table</code>). This allows us to concisely express an operation that we’d like to perform on <em>each sub-<code>data.table</code></em> before the re-assembled result is returned to us.</p> <p>This is useful in a variety of settings, the most common of which are presented here:</p> <div id="group-subsetting" class="section level2" number="3.1"> <h2 number="3.1"><span class="header-section-number">3.1</span> Group Subsetting</h2> <p>Let’s get the most recent season of data for each team in the Lahman data. This can be done quite simply with:</p> <div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true"></a><span class="co"># the data is already sorted by year; if it weren't</span></span> <span id="cb12-2"><a href="#cb12-2" aria-hidden="true"></a><span class="co"># we could do Teams[order(yearID), .SD[.N], by = teamID]</span></span> <span id="cb12-3"><a href="#cb12-3" aria-hidden="true"></a>Teams[ , .SD[.N], by =<span class="st"> </span>teamID]</span> <span id="cb12-4"><a href="#cb12-4" aria-hidden="true"></a><span class="co"># teamID yearID lgID franchID divID Rank G Ghome W L DivWin WCWin LgWin WSWin R AB H</span></span> <span id="cb12-5"><a href="#cb12-5" aria-hidden="true"></a><span class="co"># 1: BS1 1875 NA BNA <NA> 1 82 NA 71 8 <NA> <NA> Y <NA> 831 3515 1128</span></span> <span id="cb12-6"><a href="#cb12-6" aria-hidden="true"></a><span class="co"># 2: CH1 1871 NA CNA <NA> 2 28 NA 19 9 <NA> <NA> N <NA> 302 1196 323</span></span> <span id="cb12-7"><a href="#cb12-7" aria-hidden="true"></a><span class="co"># 3: CL1 1872 NA CFC <NA> 7 22 NA 6 16 <NA> <NA> N <NA> 174 943 272</span></span> <span id="cb12-8"><a href="#cb12-8" aria-hidden="true"></a><span class="co"># 4: FW1 1871 NA KEK <NA> 7 19 NA 7 12 <NA> <NA> N <NA> 137 746 178</span></span> <span id="cb12-9"><a href="#cb12-9" aria-hidden="true"></a><span class="co"># 5: NY2 1875 NA NNA <NA> 6 71 NA 30 38 <NA> <NA> N <NA> 328 2685 633</span></span> <span id="cb12-10"><a href="#cb12-10" aria-hidden="true"></a><span class="co"># --- </span></span> <span id="cb12-11"><a href="#cb12-11" aria-hidden="true"></a><span class="co"># 145: ANA 2004 AL ANA W 1 162 81 92 70 Y N N N 836 5675 1603</span></span> <span id="cb12-12"><a href="#cb12-12" aria-hidden="true"></a><span class="co"># 146: ARI 2018 NL ARI W 3 162 81 82 80 N N N N 693 5460 1283</span></span> <span id="cb12-13"><a href="#cb12-13" aria-hidden="true"></a><span class="co"># 147: MIL 2018 NL MIL C 1 163 81 96 67 Y N N N 754 5542 1398</span></span> <span id="cb12-14"><a href="#cb12-14" aria-hidden="true"></a><span class="co"># 148: TBA 2018 AL </span><span class="al">TBD</span><span class="co"> E 3 162 81 90 72 N N N N 716 5475 1415</span></span> <span id="cb12-15"><a href="#cb12-15" aria-hidden="true"></a><span class="co"># 149: MIA 2018 NL FLA E 5 161 81 63 98 N N N N 589 5488 1303</span></span> <span id="cb12-16"><a href="#cb12-16" aria-hidden="true"></a><span class="co"># X2B X3B HR BB SO SB CS HBP SF RA ER ERA CG SHO SV IPouts HA HRA BBA SOA E DP</span></span> <span id="cb12-17"><a href="#cb12-17" aria-hidden="true"></a><span class="co"># 1: 167 51 15 33 52 93 37 NA NA 343 152 1.87 60 10 17 2196 751 2 33 110 483 56</span></span> <span id="cb12-18"><a href="#cb12-18" aria-hidden="true"></a><span class="co"># 2: 52 21 10 60 22 69 21 NA NA 241 77 2.76 25 0 1 753 308 6 28 22 229 16</span></span> <span id="cb12-19"><a href="#cb12-19" aria-hidden="true"></a><span class="co"># 3: 28 5 0 17 13 12 3 NA NA 254 126 5.70 15 0 0 597 285 6 24 11 184 17</span></span> <span id="cb12-20"><a href="#cb12-20" aria-hidden="true"></a><span class="co"># 4: 19 8 2 33 9 16 4 NA NA 243 97 5.17 19 1 0 507 261 5 21 17 163 8</span></span> <span id="cb12-21"><a href="#cb12-21" aria-hidden="true"></a><span class="co"># 5: 82 21 7 19 47 20 24 NA NA 425 174 2.46 70 3 0 1910 718 4 21 77 526 30</span></span> <span id="cb12-22"><a href="#cb12-22" aria-hidden="true"></a><span class="co"># --- </span></span> <span id="cb12-23"><a href="#cb12-23" aria-hidden="true"></a><span class="co"># 145: 272 37 162 450 942 143 46 73 41 734 692 4.28 2 11 50 4363 1476 170 502 1164 90 126</span></span> <span id="cb12-24"><a href="#cb12-24" aria-hidden="true"></a><span class="co"># 146: 259 50 176 560 1460 79 25 52 45 644 605 3.72 2 9 39 4389 1313 174 522 1448 75 152</span></span> <span id="cb12-25"><a href="#cb12-25" aria-hidden="true"></a><span class="co"># 147: 252 24 218 537 1458 124 32 58 41 659 606 3.73 0 14 49 4383 1259 173 553 1428 108 141</span></span> <span id="cb12-26"><a href="#cb12-26" aria-hidden="true"></a><span class="co"># 148: 274 43 150 540 1388 128 51 101 50 646 602 3.74 0 14 52 4345 1236 164 501 1421 85 136</span></span> <span id="cb12-27"><a href="#cb12-27" aria-hidden="true"></a><span class="co"># 149: 222 24 128 455 1384 45 31 73 31 809 762 4.76 1 12 30 4326 1388 192 605 1249 83 133</span></span> <span id="cb12-28"><a href="#cb12-28" aria-hidden="true"></a><span class="co"># FP name park attendance BPF PPF teamIDBR</span></span> <span id="cb12-29"><a href="#cb12-29" aria-hidden="true"></a><span class="co"># 1: 0.870 Boston Red Stockings South End Grounds I NA 103 96 BOS</span></span> <span id="cb12-30"><a href="#cb12-30" aria-hidden="true"></a><span class="co"># 2: 0.829 Chicago White Stockings Union Base-Ball Grounds NA 104 102 CHI</span></span> <span id="cb12-31"><a href="#cb12-31" aria-hidden="true"></a><span class="co"># 3: 0.816 Cleveland Forest Citys National Association Grounds NA 96 100 CLE</span></span> <span id="cb12-32"><a href="#cb12-32" aria-hidden="true"></a><span class="co"># 4: 0.803 Fort Wayne Kekiongas Hamilton Field NA 101 107 KEK</span></span> <span id="cb12-33"><a href="#cb12-33" aria-hidden="true"></a><span class="co"># 5: 0.838 New York Mutuals Union Grounds (Brooklyn) NA 99 100 NYU</span></span> <span id="cb12-34"><a href="#cb12-34" aria-hidden="true"></a><span class="co"># --- </span></span> <span id="cb12-35"><a href="#cb12-35" aria-hidden="true"></a><span class="co"># 145: 0.985 Anaheim Angels Angels Stadium of Anaheim 3375677 97 97 ANA</span></span> <span id="cb12-36"><a href="#cb12-36" aria-hidden="true"></a><span class="co"># 146: 0.988 Arizona Diamondbacks Chase Field 2242695 108 107 ARI</span></span> <span id="cb12-37"><a href="#cb12-37" aria-hidden="true"></a><span class="co"># 147: 0.982 Milwaukee Brewers Miller Park 2850875 102 101 MIL</span></span> <span id="cb12-38"><a href="#cb12-38" aria-hidden="true"></a><span class="co"># 148: 0.986 Tampa Bay Rays Tropicana Field 1154973 97 97 TBR</span></span> <span id="cb12-39"><a href="#cb12-39" aria-hidden="true"></a><span class="co"># 149: 0.986 Miami Marlins Marlins Park 811104 89 90 MIA</span></span> <span id="cb12-40"><a href="#cb12-40" aria-hidden="true"></a><span class="co"># teamIDlahman45 teamIDretro</span></span> <span id="cb12-41"><a href="#cb12-41" aria-hidden="true"></a><span class="co"># 1: BS1 BS1</span></span> <span id="cb12-42"><a href="#cb12-42" aria-hidden="true"></a><span class="co"># 2: CH1 CH1</span></span> <span id="cb12-43"><a href="#cb12-43" aria-hidden="true"></a><span class="co"># 3: CL1 CL1</span></span> <span id="cb12-44"><a href="#cb12-44" aria-hidden="true"></a><span class="co"># 4: FW1 FW1</span></span> <span id="cb12-45"><a href="#cb12-45" aria-hidden="true"></a><span class="co"># 5: NY2 NY2</span></span> <span id="cb12-46"><a href="#cb12-46" aria-hidden="true"></a><span class="co"># --- </span></span> <span id="cb12-47"><a href="#cb12-47" aria-hidden="true"></a><span class="co"># 145: ANA ANA</span></span> <span id="cb12-48"><a href="#cb12-48" aria-hidden="true"></a><span class="co"># 146: ARI ARI</span></span> <span id="cb12-49"><a href="#cb12-49" aria-hidden="true"></a><span class="co"># 147: ML4 MIL</span></span> <span id="cb12-50"><a href="#cb12-50" aria-hidden="true"></a><span class="co"># 148: TBA TBA</span></span> <span id="cb12-51"><a href="#cb12-51" aria-hidden="true"></a><span class="co"># 149: FLO MIA</span></span></code></pre></div> <p>Recall that <code>.SD</code> is itself a <code>data.table</code>, and that <code>.N</code> refers to the total number of rows in a group (it’s equal to <code>nrow(.SD)</code> within each group), so <code>.SD[.N]</code> returns the <em>entirety of <code>.SD</code></em> for the final row associated with each <code>teamID</code>.</p> <p>Another common version of this is to use <code>.SD[1L]</code> instead to get the <em>first</em> observation for each group, or <code>.SD[sample(.N, 1L)]</code> to return a <em>random</em> row for each group.</p> </div> <div id="group-optima" class="section level2" number="3.2"> <h2 number="3.2"><span class="header-section-number">3.2</span> Group Optima</h2> <p>Suppose we wanted to return the <em>best</em> year for each team, as measured by their total number of runs scored (<code>R</code>; we could easily adjust this to refer to other metrics, of course). Instead of taking a <em>fixed</em> element from each sub-<code>data.table</code>, we now define the desired index <em>dynamically</em> as follows:</p> <div class="sourceCode" id="cb13"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true"></a>Teams[ , .SD[<span class="kw">which.max</span>(R)], by =<span class="st"> </span>teamID]</span> <span id="cb13-2"><a href="#cb13-2" aria-hidden="true"></a><span class="co"># teamID yearID lgID franchID divID Rank G Ghome W L DivWin WCWin LgWin WSWin R AB H</span></span> <span id="cb13-3"><a href="#cb13-3" aria-hidden="true"></a><span class="co"># 1: BS1 1875 NA BNA <NA> 1 82 NA 71 8 <NA> <NA> Y <NA> 831 3515 1128</span></span> <span id="cb13-4"><a href="#cb13-4" aria-hidden="true"></a><span class="co"># 2: CH1 1871 NA CNA <NA> 2 28 NA 19 9 <NA> <NA> N <NA> 302 1196 323</span></span> <span id="cb13-5"><a href="#cb13-5" aria-hidden="true"></a><span class="co"># 3: CL1 1871 NA CFC <NA> 8 29 NA 10 19 <NA> <NA> N <NA> 249 1186 328</span></span> <span id="cb13-6"><a href="#cb13-6" aria-hidden="true"></a><span class="co"># 4: FW1 1871 NA KEK <NA> 7 19 NA 7 12 <NA> <NA> N <NA> 137 746 178</span></span> <span id="cb13-7"><a href="#cb13-7" aria-hidden="true"></a><span class="co"># 5: NY2 1872 NA NNA <NA> 3 56 NA 34 20 <NA> <NA> N <NA> 523 2426 670</span></span> <span id="cb13-8"><a href="#cb13-8" aria-hidden="true"></a><span class="co"># --- </span></span> <span id="cb13-9"><a href="#cb13-9" aria-hidden="true"></a><span class="co"># 145: ANA 2000 AL ANA W 3 162 81 82 80 N N N N 864 5628 1574</span></span> <span id="cb13-10"><a href="#cb13-10" aria-hidden="true"></a><span class="co"># 146: ARI 1999 NL ARI W 1 162 81 100 62 Y N N N 908 5658 1566</span></span> <span id="cb13-11"><a href="#cb13-11" aria-hidden="true"></a><span class="co"># 147: MIL 1999 NL MIL C 5 161 80 74 87 N N N N 815 5582 1524</span></span> <span id="cb13-12"><a href="#cb13-12" aria-hidden="true"></a><span class="co"># 148: TBA 2009 AL </span><span class="al">TBD</span><span class="co"> E 3 162 81 84 78 N N N N 803 5462 1434</span></span> <span id="cb13-13"><a href="#cb13-13" aria-hidden="true"></a><span class="co"># 149: MIA 2017 NL FLA E 2 162 78 77 85 N N N N 778 5602 1497</span></span> <span id="cb13-14"><a href="#cb13-14" aria-hidden="true"></a><span class="co"># X2B X3B HR BB SO SB CS HBP SF RA ER ERA CG SHO SV IPouts HA HRA BBA SOA E DP</span></span> <span id="cb13-15"><a href="#cb13-15" aria-hidden="true"></a><span class="co"># 1: 167 51 15 33 52 93 37 NA NA 343 152 1.87 60 10 17 2196 751 2 33 110 483 56</span></span> <span id="cb13-16"><a href="#cb13-16" aria-hidden="true"></a><span class="co"># 2: 52 21 10 60 22 69 21 NA NA 241 77 2.76 25 0 1 753 308 6 28 22 229 16</span></span> <span id="cb13-17"><a href="#cb13-17" aria-hidden="true"></a><span class="co"># 3: 35 40 7 26 25 18 8 NA NA 341 116 4.11 23 0 0 762 346 13 53 34 234 15</span></span> <span id="cb13-18"><a href="#cb13-18" aria-hidden="true"></a><span class="co"># 4: 19 8 2 33 9 16 4 NA NA 243 97 5.17 19 1 0 507 261 5 21 17 163 8</span></span> <span id="cb13-19"><a href="#cb13-19" aria-hidden="true"></a><span class="co"># 5: 87 14 4 58 52 59 22 NA NA 362 172 3.02 54 3 1 1536 622 2 33 46 323 33</span></span> <span id="cb13-20"><a href="#cb13-20" aria-hidden="true"></a><span class="co"># --- </span></span> <span id="cb13-21"><a href="#cb13-21" aria-hidden="true"></a><span class="co"># 145: 309 34 236 608 1024 93 52 47 43 869 805 5.00 5 3 46 4344 1534 228 662 846 134 182</span></span> <span id="cb13-22"><a href="#cb13-22" aria-hidden="true"></a><span class="co"># 146: 289 46 216 588 1045 137 39 48 60 676 615 3.77 16 9 42 4402 1387 176 543 1198 104 132</span></span> <span id="cb13-23"><a href="#cb13-23" aria-hidden="true"></a><span class="co"># 147: 299 30 165 658 1065 81 33 55 51 886 813 5.07 2 5 40 4328 1618 213 616 987 127 146</span></span> <span id="cb13-24"><a href="#cb13-24" aria-hidden="true"></a><span class="co"># 148: 297 36 199 642 1229 194 61 49 45 754 686 4.33 3 5 41 4282 1421 183 515 1125 98 135</span></span> <span id="cb13-25"><a href="#cb13-25" aria-hidden="true"></a><span class="co"># 149: 271 31 194 486 1282 91 30 67 41 822 772 4.82 1 7 34 4328 1450 193 627 1202 73 156</span></span> <span id="cb13-26"><a href="#cb13-26" aria-hidden="true"></a><span class="co"># FP name park attendance BPF PPF teamIDBR</span></span> <span id="cb13-27"><a href="#cb13-27" aria-hidden="true"></a><span class="co"># 1: 0.870 Boston Red Stockings South End Grounds I NA 103 96 BOS</span></span> <span id="cb13-28"><a href="#cb13-28" aria-hidden="true"></a><span class="co"># 2: 0.829 Chicago White Stockings Union Base-Ball Grounds NA 104 102 CHI</span></span> <span id="cb13-29"><a href="#cb13-29" aria-hidden="true"></a><span class="co"># 3: 0.818 Cleveland Forest Citys National Association Grounds NA 96 100 CLE</span></span> <span id="cb13-30"><a href="#cb13-30" aria-hidden="true"></a><span class="co"># 4: 0.803 Fort Wayne Kekiongas Hamilton Field NA 101 107 KEK</span></span> <span id="cb13-31"><a href="#cb13-31" aria-hidden="true"></a><span class="co"># 5: 0.868 New York Mutuals Union Grounds (Brooklyn) NA 93 92 NYU</span></span> <span id="cb13-32"><a href="#cb13-32" aria-hidden="true"></a><span class="co"># --- </span></span> <span id="cb13-33"><a href="#cb13-33" aria-hidden="true"></a><span class="co"># 145: 0.978 Anaheim Angels Edison International Field 2066982 102 103 ANA</span></span> <span id="cb13-34"><a href="#cb13-34" aria-hidden="true"></a><span class="co"># 146: 0.983 Arizona Diamondbacks Bank One Ballpark 3019654 101 101 ARI</span></span> <span id="cb13-35"><a href="#cb13-35" aria-hidden="true"></a><span class="co"># 147: 0.979 Milwaukee Brewers County Stadium 1701796 99 99 MIL</span></span> <span id="cb13-36"><a href="#cb13-36" aria-hidden="true"></a><span class="co"># 148: 0.983 Tampa Bay Rays Tropicana Field 1874962 98 97 TBR</span></span> <span id="cb13-37"><a href="#cb13-37" aria-hidden="true"></a><span class="co"># 149: 0.988 Miami Marlins Marlins Park 1583014 93 93 MIA</span></span> <span id="cb13-38"><a href="#cb13-38" aria-hidden="true"></a><span class="co"># teamIDlahman45 teamIDretro</span></span> <span id="cb13-39"><a href="#cb13-39" aria-hidden="true"></a><span class="co"># 1: BS1 BS1</span></span> <span id="cb13-40"><a href="#cb13-40" aria-hidden="true"></a><span class="co"># 2: CH1 CH1</span></span> <span id="cb13-41"><a href="#cb13-41" aria-hidden="true"></a><span class="co"># 3: CL1 CL1</span></span> <span id="cb13-42"><a href="#cb13-42" aria-hidden="true"></a><span class="co"># 4: FW1 FW1</span></span> <span id="cb13-43"><a href="#cb13-43" aria-hidden="true"></a><span class="co"># 5: NY2 NY2</span></span> <span id="cb13-44"><a href="#cb13-44" aria-hidden="true"></a><span class="co"># --- </span></span> <span id="cb13-45"><a href="#cb13-45" aria-hidden="true"></a><span class="co"># 145: ANA ANA</span></span> <span id="cb13-46"><a href="#cb13-46" aria-hidden="true"></a><span class="co"># 146: ARI ARI</span></span> <span id="cb13-47"><a href="#cb13-47" aria-hidden="true"></a><span class="co"># 147: ML4 MIL</span></span> <span id="cb13-48"><a href="#cb13-48" aria-hidden="true"></a><span class="co"># 148: TBA TBA</span></span> <span id="cb13-49"><a href="#cb13-49" aria-hidden="true"></a><span class="co"># 149: FLO MIA</span></span></code></pre></div> <p>Note that this approach can of course be combined with <code>.SDcols</code> to return only portions of the <code>data.table</code> for each <code>.SD</code> (with the caveat that <code>.SDcols</code> should be fixed across the various subsets)</p> <p><em>NB</em>: <code>.SD[1L]</code> is currently optimized by <a href="https://Rdatatable.gitlab.io/data.table/library/data.table/html/datatable-optimize.html"><em><code>GForce</code></em></a> (<a href="https://stackoverflow.com/questions/22137591/about-gforce-in-data-table-1-9-2">see also</a>), <code>data.table</code> internals which massively speed up the most common grouped operations like <code>sum</code> or <code>mean</code> – see <code>?GForce</code> for more details and keep an eye on/voice support for feature improvement requests for updates on this front: <a href="https://github.com/Rdatatable/data.table/issues/735">1</a>, <a href="https://github.com/Rdatatable/data.table/issues/2778">2</a>, <a href="https://github.com/Rdatatable/data.table/issues/523">3</a>, <a href="https://github.com/Rdatatable/data.table/issues/971">4</a>, <a href="https://github.com/Rdatatable/data.table/issues/1197">5</a>, <a href="https://github.com/Rdatatable/data.table/issues/1414">6</a></p> </div> <div id="grouped-regression" class="section level2" number="3.3"> <h2 number="3.3"><span class="header-section-number">3.3</span> Grouped Regression</h2> <p>Returning to the inquiry above regarding the relationship between <code>ERA</code> and <code>W</code>, suppose we expect this relationship to differ by team (i.e., there’s a different slope for each team). We can easily re-run this regression to explore the heterogeneity in this relationship as follows (noting that the standard errors from this approach are generally incorrect – the specification <code>ERA ~ W*teamID</code> will be better – this approach is easier to read and the <em>coefficients</em> are OK):</p> <div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true"></a><span class="co"># Overall coefficient for comparison</span></span> <span id="cb14-2"><a href="#cb14-2" aria-hidden="true"></a>overall_coef =<span class="st"> </span>Pitching[ , <span class="kw">coef</span>(<span class="kw">lm</span>(ERA <span class="op">~</span><span class="st"> </span>W))[<span class="st">'W'</span>]]</span> <span id="cb14-3"><a href="#cb14-3" aria-hidden="true"></a><span class="co"># use the .N > 20 filter to exclude teams with few observations</span></span> <span id="cb14-4"><a href="#cb14-4" aria-hidden="true"></a>Pitching[ , <span class="cf">if</span> (.N <span class="op">></span><span class="st"> </span>20L) .(<span class="dt">w_coef =</span> <span class="kw">coef</span>(<span class="kw">lm</span>(ERA <span class="op">~</span><span class="st"> </span>W))[<span class="st">'W'</span>]), by =<span class="st"> </span>teamID</span> <span id="cb14-5"><a href="#cb14-5" aria-hidden="true"></a> ][ , <span class="kw">hist</span>(w_coef, 20L, <span class="dt">las =</span> 1L,</span> <span id="cb14-6"><a href="#cb14-6" aria-hidden="true"></a> <span class="dt">xlab =</span> <span class="st">'Fitted Coefficient on W'</span>,</span> <span id="cb14-7"><a href="#cb14-7" aria-hidden="true"></a> <span class="dt">ylab =</span> <span class="st">'Number of Teams'</span>, <span class="dt">col =</span> <span class="st">'darkgreen'</span>,</span> <span id="cb14-8"><a href="#cb14-8" aria-hidden="true"></a> <span class="dt">main =</span> <span class="st">'Team-Level Distribution</span><span class="ch">\n</span><span class="st">Win Coefficients on ERA'</span>)]</span> <span id="cb14-9"><a href="#cb14-9" aria-hidden="true"></a><span class="kw">abline</span>(<span class="dt">v =</span> overall_coef, <span class="dt">lty =</span> 2L, <span class="dt">col =</span> <span class="st">'red'</span>)</span></code></pre></div> <p><img src="" width="100%" /></p> <p>While there is indeed a fair amount of heterogeneity, there’s a distinct concentration around the observed overall value.</p> <p>The above is just a short introduction of the power of <code>.SD</code> in facilitating beautiful, efficient code in <code>data.table</code>!</p> </div> </div> <!-- code folding --> <!-- dynamically load mathjax for compatibility with self-contained --> <script> (function () { var script = document.createElement("script"); script.type = "text/javascript"; script.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"; document.getElementsByTagName("head")[0].appendChild(script); })(); </script> </body> </html>