EVOLUTION-MANAGER
Edit File: janitor.html
<!DOCTYPE html> <html xmlns="http://www.w3.org/1999/xhtml"> <head> <meta charset="utf-8"> <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> <meta name="generator" content="pandoc" /> <meta name="viewport" content="width=device-width, initial-scale=1"> <style type="text/css"> @font-face { font-family: octicons-link; src: url(data:font/woff;charset=utf-8;base64,d09GRgABAAAAAAZwABAAAAAACFQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABEU0lHAAAGaAAAAAgAAAAIAAAAAUdTVUIAAAZcAAAACgAAAAoAAQAAT1MvMgAAAyQAAABJAAAAYFYEU3RjbWFwAAADcAAAAEUAAACAAJThvmN2dCAAAATkAAAABAAAAAQAAAAAZnBnbQAAA7gAAACyAAABCUM+8IhnYXNwAAAGTAAAABAAAAAQABoAI2dseWYAAAFsAAABPAAAAZwcEq9taGVhZAAAAsgAAAA0AAAANgh4a91oaGVhAAADCAAAABoAAAAkCA8DRGhtdHgAAAL8AAAADAAAAAwGAACfbG9jYQAAAsAAAAAIAAAACABiATBtYXhwAAACqAAAABgAAAAgAA8ASm5hbWUAAAToAAABQgAAAlXu73sOcG9zdAAABiwAAAAeAAAAME3QpOBwcmVwAAAEbAAAAHYAAAB/aFGpk3jaTY6xa8JAGMW/O62BDi0tJLYQincXEypYIiGJjSgHniQ6umTsUEyLm5BV6NDBP8Tpts6F0v+k/0an2i+itHDw3v2+9+DBKTzsJNnWJNTgHEy4BgG3EMI9DCEDOGEXzDADU5hBKMIgNPZqoD3SilVaXZCER3/I7AtxEJLtzzuZfI+VVkprxTlXShWKb3TBecG11rwoNlmmn1P2WYcJczl32etSpKnziC7lQyWe1smVPy/Lt7Kc+0vWY/gAgIIEqAN9we0pwKXreiMasxvabDQMM4riO+qxM2ogwDGOZTXxwxDiycQIcoYFBLj5K3EIaSctAq2kTYiw+ymhce7vwM9jSqO8JyVd5RH9gyTt2+J/yUmYlIR0s04n6+7Vm1ozezUeLEaUjhaDSuXHwVRgvLJn1tQ7xiuVv/ocTRF42mNgZGBgYGbwZOBiAAFGJBIMAAizAFoAAABiAGIAznjaY2BkYGAA4in8zwXi+W2+MjCzMIDApSwvXzC97Z4Ig8N/BxYGZgcgl52BCSQKAA3jCV8CAABfAAAAAAQAAEB42mNgZGBg4f3vACQZQABIMjKgAmYAKEgBXgAAeNpjYGY6wTiBgZWBg2kmUxoDA4MPhGZMYzBi1AHygVLYQUCaawqDA4PChxhmh/8ODDEsvAwHgMKMIDnGL0x7gJQCAwMAJd4MFwAAAHjaY2BgYGaA4DAGRgYQkAHyGMF8NgYrIM3JIAGVYYDT+AEjAwuDFpBmA9KMDEwMCh9i/v8H8sH0/4dQc1iAmAkALaUKLgAAAHjaTY9LDsIgEIbtgqHUPpDi3gPoBVyRTmTddOmqTXThEXqrob2gQ1FjwpDvfwCBdmdXC5AVKFu3e5MfNFJ29KTQT48Ob9/lqYwOGZxeUelN2U2R6+cArgtCJpauW7UQBqnFkUsjAY/kOU1cP+DAgvxwn1chZDwUbd6CFimGXwzwF6tPbFIcjEl+vvmM/byA48e6tWrKArm4ZJlCbdsrxksL1AwWn/yBSJKpYbq8AXaaTb8AAHja28jAwOC00ZrBeQNDQOWO//sdBBgYGRiYWYAEELEwMTE4uzo5Zzo5b2BxdnFOcALxNjA6b2ByTswC8jYwg0VlNuoCTWAMqNzMzsoK1rEhNq
ByEyerg5PMJlYuVueETKcd/89uBpnpvIEVomeHLoMsAAe1Id4AAAAAAAB42oWQT07CQBTGv0JBhagk7HQzKxca2sJCE1hDt4QF+9JOS0nbaaYDCQfwCJ7Au3AHj+LO13FMmm6cl7785vven0kBjHCBhfpYuNa5Ph1c0e2Xu3jEvWG7UdPDLZ4N92nOm+EBXuAbHmIMSRMs+4aUEd4Nd3CHD8NdvOLTsA2GL8M9PODbcL+hD7C1xoaHeLJSEao0FEW14ckxC+TU8TxvsY6X0eLPmRhry2WVioLpkrbp84LLQPGI7c6sOiUzpWIWS5GzlSgUzzLBSikOPFTOXqly7rqx0Z1Q5BAIoZBSFihQYQOOBEdkCOgXTOHA07HAGjGWiIjaPZNW13/+lm6S9FT7rLHFJ6fQbkATOG1j2OFMucKJJsxIVfQORl+9Jyda6Sl1dUYhSCm1dyClfoeDve4qMYdLEbfqHf3O/AdDumsjAAB42mNgYoAAZQYjBmyAGYQZmdhL8zLdDEydARfoAqIAAAABAAMABwAKABMAB///AA8AAQAAAAAAAAAAAAAAAAABAAAAAA==) format('woff'); } body { -webkit-text-size-adjust: 100%; text-size-adjust: 100%; color: #333; font-family: "Helvetica Neue", Helvetica, "Segoe UI", Arial, freesans, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol"; font-size: 16px; line-height: 1.6; word-wrap: break-word; } a { background-color: transparent; } a:active, a:hover { outline: 0; } strong { font-weight: bold; } h1 { font-size: 2em; margin: 0.67em 0; } img { border: 0; } hr { box-sizing: content-box; height: 0; } pre { overflow: auto; } code, kbd, pre { font-family: monospace, monospace; font-size: 1em; } input { color: inherit; font: inherit; margin: 0; } html input[disabled] { cursor: default; } input { line-height: normal; } input[type="checkbox"] { box-sizing: border-box; padding: 0; } table { border-collapse: collapse; border-spacing: 0; } td, th { padding: 0; } * { box-sizing: border-box; } input { font: 13px / 1.4 Helvetica, arial, nimbussansl, liberationsans, freesans, clean, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol"; } a { color: #4078c0; text-decoration: none; } a:hover, a:active { text-decoration: underline; } hr { height: 0; margin: 15px 0; overflow: hidden; background: transparent; border: 0; border-bottom: 1px solid #ddd; } hr:before { display: table; content: ""; } hr:after { display: table; clear: both; content: ""; } h1, h2, h3, h4, h5, h6 { margin-top: 15px; 
margin-bottom: 15px; line-height: 1.1; } h1 { font-size: 30px; } h2 { font-size: 21px; } h3 { font-size: 16px; } h4 { font-size: 14px; } h5 { font-size: 12px; } h6 { font-size: 11px; } blockquote { margin: 0; } ul, ol { padding: 0; margin-top: 0; margin-bottom: 0; } ol ol, ul ol { list-style-type: lower-roman; } ul ul ol, ul ol ol, ol ul ol, ol ol ol { list-style-type: lower-alpha; } dd { margin-left: 0; } code { font-family: Consolas, "Liberation Mono", Menlo, Courier, monospace; font-size: 12px; } pre { margin-top: 0; margin-bottom: 0; font: 12px Consolas, "Liberation Mono", Menlo, Courier, monospace; } .select::-ms-expand { opacity: 0; } .octicon { font: normal normal normal 16px/1 octicons-link; display: inline-block; text-decoration: none; text-rendering: auto; -webkit-font-smoothing: antialiased; -moz-osx-font-smoothing: grayscale; -webkit-user-select: none; -moz-user-select: none; -ms-user-select: none; user-select: none; } .octicon-link:before { content: '\f05c'; } .markdown-body:before { display: table; content: ""; } .markdown-body:after { display: table; clear: both; content: ""; } .markdown-body>*:first-child { margin-top: 0 !important; } .markdown-body>*:last-child { margin-bottom: 0 !important; } a:not([href]) { color: inherit; text-decoration: none; } .anchor { display: inline-block; padding-right: 2px; margin-left: -18px; } .anchor:focus { outline: none; } h1, h2, h3, h4, h5, h6 { margin-top: 1em; margin-bottom: 16px; font-weight: bold; line-height: 1.4; } h1 .octicon-link, h2 .octicon-link, h3 .octicon-link, h4 .octicon-link, h5 .octicon-link, h6 .octicon-link { color: #000; vertical-align: middle; visibility: hidden; } h1:hover .anchor, h2:hover .anchor, h3:hover .anchor, h4:hover .anchor, h5:hover .anchor, h6:hover .anchor { text-decoration: none; } h1:hover .anchor .octicon-link, h2:hover .anchor .octicon-link, h3:hover .anchor .octicon-link, h4:hover .anchor .octicon-link, h5:hover .anchor .octicon-link, h6:hover .anchor .octicon-link { 
visibility: visible; } h1 { padding-bottom: 0.3em; font-size: 2.25em; line-height: 1.2; border-bottom: 1px solid #eee; } h1 .anchor { line-height: 1; } h2 { padding-bottom: 0.3em; font-size: 1.75em; line-height: 1.225; border-bottom: 1px solid #eee; } h2 .anchor { line-height: 1; } h3 { font-size: 1.5em; line-height: 1.43; } h3 .anchor { line-height: 1.2; } h4 { font-size: 1.25em; } h4 .anchor { line-height: 1.2; } h5 { font-size: 1em; } h5 .anchor { line-height: 1.1; } h6 { font-size: 1em; color: #777; } h6 .anchor { line-height: 1.1; } p, blockquote, ul, ol, dl, table, pre { margin-top: 0; margin-bottom: 16px; } hr { height: 4px; padding: 0; margin: 16px 0; background-color: #e7e7e7; border: 0 none; } ul, ol { padding-left: 2em; } ul ul, ul ol, ol ol, ol ul { margin-top: 0; margin-bottom: 0; } li>p { margin-top: 16px; } dl { padding: 0; } dl dt { padding: 0; margin-top: 16px; font-size: 1em; font-style: italic; font-weight: bold; } dl dd { padding: 0 16px; margin-bottom: 16px; } blockquote { padding: 0 15px; color: #777; border-left: 4px solid #ddd; } blockquote>:first-child { margin-top: 0; } blockquote>:last-child { margin-bottom: 0; } table { display: block; width: 100%; overflow: auto; word-break: normal; word-break: keep-all; } table th { font-weight: bold; } table th, table td { padding: 6px 13px; border: 1px solid #ddd; } table tr { background-color: #fff; border-top: 1px solid #ccc; } table tr:nth-child(2n) { background-color: #f8f8f8; } img { max-width: 100%; box-sizing: content-box; background-color: #fff; } code { padding: 0; padding-top: 0.2em; padding-bottom: 0.2em; margin: 0; font-size: 85%; background-color: rgba(0,0,0,0.04); border-radius: 3px; } code:before, code:after { letter-spacing: -0.2em; content: "\00a0"; } pre>code { padding: 0; margin: 0; font-size: 100%; word-break: normal; white-space: pre; background: transparent; border: 0; } .highlight { margin-bottom: 16px; } .highlight pre, pre { padding: 16px; overflow: auto; font-size: 85%; 
line-height: 1.45; background-color: #f7f7f7; border-radius: 3px; } .highlight pre { margin-bottom: 0; word-break: normal; } pre { word-wrap: normal; } pre code { display: inline; max-width: initial; padding: 0; margin: 0; overflow: initial; line-height: inherit; word-wrap: normal; background-color: transparent; border: 0; } pre code:before, pre code:after { content: normal; } kbd { display: inline-block; padding: 3px 5px; font-size: 11px; line-height: 10px; color: #555; vertical-align: middle; background-color: #fcfcfc; border: solid 1px #ccc; border-bottom-color: #bbb; border-radius: 3px; box-shadow: inset 0 -1px 0 #bbb; } .pl-c { color: #969896; } .pl-c1, .pl-s .pl-v { color: #0086b3; } .pl-e, .pl-en { color: #795da3; } .pl-s .pl-s1, .pl-smi { color: #333; } .pl-ent { color: #63a35c; } .pl-k { color: #a71d5d; } .pl-pds, .pl-s, .pl-s .pl-pse .pl-s1, .pl-sr, .pl-sr .pl-cce, .pl-sr .pl-sra, .pl-sr .pl-sre { color: #183691; } .pl-v { color: #ed6a43; } .pl-id { color: #b52a1d; } .pl-ii { background-color: #b52a1d; color: #f8f8f8; } .pl-sr .pl-cce { color: #63a35c; font-weight: bold; } .pl-ml { color: #693a17; } .pl-mh, .pl-mh .pl-en, .pl-ms { color: #1d3e81; font-weight: bold; } .pl-mq { color: #008080; } .pl-mi { color: #333; font-style: italic; } .pl-mb { color: #333; font-weight: bold; } .pl-md { background-color: #ffecec; color: #bd2c00; } .pl-mi1 { background-color: #eaffea; color: #55a532; } .pl-mdr { color: #795da3; font-weight: bold; } .pl-mo { color: #1d3e81; } kbd { display: inline-block; padding: 3px 5px; font: 11px Consolas, "Liberation Mono", Menlo, Courier, monospace; line-height: 10px; color: #555; vertical-align: middle; background-color: #fcfcfc; border: solid 1px #ccc; border-bottom-color: #bbb; border-radius: 3px; box-shadow: inset 0 -1px 0 #bbb; } .task-list-item { list-style-type: none; } .task-list-item+.task-list-item { margin-top: 3px; } .task-list-item input { margin: 0 0.35em 0.25em -1.6em; vertical-align: middle; } :checked+.radio-label { 
z-index: 1; position: relative; border-color: #4078c0; } .sourceLine { display: inline-block; } code .kw { color: #000000; } code .dt { color: #ed6a43; } code .dv { color: #009999; } code .bn { color: #009999; } code .fl { color: #009999; } code .ch { color: #009999; } code .st { color: #183691; } code .co { color: #969896; } code .ot { color: #0086b3; } code .al { color: #a61717; } code .fu { color: #63a35c; } code .er { color: #a61717; background-color: #e3d2d2; } code .wa { color: #000000; } code .cn { color: #008080; } code .sc { color: #008080; } code .vs { color: #183691; } code .ss { color: #183691; } code .im { color: #000000; } code .va {color: #008080; } code .cf { color: #000000; } code .op { color: #000000; } code .bu { color: #000000; } code .ex { color: #000000; } code .pp { color: #999999; } code .at { color: #008080; } code .do { color: #969896; } code .an { color: #008080; } code .cv { color: #008080; } code .in { color: #008080; } </style> <style> body { box-sizing: border-box; min-width: 200px; max-width: 980px; margin: 0 auto; padding: 45px; padding-top: 0px; } </style> </head> <body> <h1 id="overview-of-janitor-functions">Overview of janitor functions</h1> <p>2021-01-04</p> <ul> <li><a href="#major-functions">Major functions</a> <ul> <li><a href="#cleaning">Cleaning</a> <ul> <li><a href="#clean-dataframe-names-with-clean_names">Clean data.frame names with <code>clean_names()</code></a></li> <li><a href="#do-those-dataframes-actually-contain-the-same-columns">Do those data.frames actually contain the same columns?</a></li> </ul></li> <li><a href="#exploring">Exploring</a> <ul> <li><a href="#tabyl---a-better-version-of-table"><code>tabyl()</code> - a better version of <code>table()</code></a></li> <li><a href="#explore-records-with-duplicated-values-for-specific-combinations-of-variables-with-get_dupes">Explore records with duplicated values for specific combinations of variables with <code>get_dupes()</code></a></li> </ul></li> </ul></li>
<li><a href="#minor-functions">Minor functions</a> <ul> <li><a href="#cleaning-1">Cleaning</a> <ul> <li><a href="#manipulate-vectors-of-names-with-make_clean_names">Manipulate vectors of names with <code>make_clean_names()</code></a></li> <li><a href="#remove_empty-rows-and-columns"><code>remove_empty()</code> rows and columns</a></li> <li><a href="#remove_constant-columns"><code>remove_constant()</code> columns</a></li> <li><a href="#directionally-consistent-rounding-behavior-with-round_half_up">Directionally-consistent rounding behavior with <code>round_half_up()</code></a></li> <li><a href="#round-decimals-to-precise-fractions-of-a-given-denominator-with-round_to_fraction">Round decimals to precise fractions of a given denominator with <code>round_to_fraction()</code></a></li> <li><a href="#fix-dates-stored-as-serial-numbers-with-excel_numeric_to_date">Fix dates stored as serial numbers with <code>excel_numeric_to_date()</code></a></li> <li><a href="#convert-a-mix-of-date-and-datetime-formats-to-date">Convert a mix of date and datetime formats to date</a></li> <li><a href="#elevate-column-names-stored-in-a-dataframe-row">Elevate column names stored in a data.frame row</a></li> </ul></li> <li><a href="#exploring-1">Exploring</a> <ul> <li><a href="#count-factor-levels-in-groups-of-high-medium-and-low-with-top_levels">Count factor levels in groups of high, medium, and low with <code>top_levels()</code></a></li> </ul></li> </ul></li> </ul> <p>The janitor functions expedite the initial data exploration and cleaning that comes with any new data set.
This catalog describes the usage for each function.</p> <h1 id="major-functions">Major functions</h1> <p>Functions for everyday use.</p> <h2 id="cleaning">Cleaning</h2> <h3 id="clean-dataframe-names-with-clean_names">Clean data.frame names with <code>clean_names()</code></h3> <p>Call this function every time you read data.</p> <p>It works in a <code>%>%</code> pipeline, and handles problematic variable names, especially those that are so well-preserved by <code>readxl::read_excel()</code> and <code>readr::read_csv()</code>.</p> <ul> <li>Parses letter cases and separators to a consistent format. <ul> <li>Default is to snake_case, but other cases like camelCase are available</li> </ul></li> <li>Handles special characters and spaces, including transliterating characters like <code>œ</code> to <code>oe</code>.</li> <li>Appends numbers to duplicated names</li> <li>Converts “%” to “percent” and “#” to “number” to retain meaning</li> <li>Spacing (or lack thereof) around numbers is preserved</li> </ul> <!-- end list --> <div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb1-1" title="1"><span class="co"># Create a data.frame with dirty names</span></a> <a class="sourceLine" id="cb1-2" title="2">test_df <-<span class="st"> </span><span class="kw">as.data.frame</span>(<span class="kw">matrix</span>(<span class="dt">ncol =</span> <span class="dv">6</span>))</a> <a class="sourceLine" id="cb1-3" title="3"><span class="kw">names</span>(test_df) <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"firstName"</span>, <span class="st">"ábc@!*"</span>, <span class="st">"% successful (2009)"</span>,</a> <a class="sourceLine" id="cb1-4" title="4"> <span class="st">"REPEAT VALUE"</span>, <span class="st">"REPEAT VALUE"</span>, <span class="st">""</span>)</a></code></pre></div> <p>Clean the variable names, returning a data.frame:</p> <div class="sourceCode" id="cb2"><pre class="sourceCode r"><code 
class="sourceCode r"><a class="sourceLine" id="cb2-1" title="1">test_df <span class="op">%>%</span></a> <a class="sourceLine" id="cb2-2" title="2"><span class="st"> </span><span class="kw">clean_names</span>()</a> <a class="sourceLine" id="cb2-3" title="3"><span class="co">#> first_name abc percent_successful_2009 repeat_value repeat_value_2 x</span></a> <a class="sourceLine" id="cb2-4" title="4"><span class="co">#> 1 NA NA NA NA NA NA</span></a></code></pre></div> <p>Compare to what base R produces:</p> <div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb3-1" title="1"><span class="kw">make.names</span>(<span class="kw">names</span>(test_df))</a> <a class="sourceLine" id="cb3-2" title="2"><span class="co">#> [1] "firstName" "ábc..." "X..successful..2009."</span></a> <a class="sourceLine" id="cb3-3" title="3"><span class="co">#> [4] "REPEAT.VALUE" "REPEAT.VALUE" "X"</span></a></code></pre></div> <p>This function is powered by the underlying exported function <strong><code>make_clean_names()</code></strong>, which accepts and returns a character vector of names (see below). This allows for cleaning the names of <em>any</em> object, not just a data.frame. <code>clean_names()</code> is retained for its convenience in piped workflows, and can be called on an <code>sf</code> simple features object or a <code>tbl_graph</code> tidygraph object in addition to a data.frame.</p> <h3 id="do-those-dataframes-actually-contain-the-same-columns">Do those data.frames actually contain the same columns?</h3> <h4 id="check-with-compare_df_cols">Check with <code>compare_df_cols()</code></h4> <p>For cases when you are given a set of data files that <em>should</em> be identical, and you wish to read and combine them for analysis. 
But then <code>dplyr::bind_rows()</code> or <code>rbind()</code> fails, because of different columns or because the column classes don’t match across data.frames.</p> <p><code>compare_df_cols()</code> takes unquoted names of data.frames / tibbles, or a list of data.frames, and returns a summary of how they compare. See what the column types are, which are missing or present in the different inputs, and how column types differ.</p> <div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb4-1" title="1">df1 <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">a =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">2</span>, <span class="dt">b =</span> <span class="kw">c</span>(<span class="st">"big"</span>, <span class="st">"small"</span>))</a> <a class="sourceLine" id="cb4-2" title="2">df2 <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">a =</span> <span class="dv">10</span><span class="op">:</span><span class="dv">12</span>, <span class="dt">b =</span> <span class="kw">c</span>(<span class="st">"medium"</span>, <span class="st">"small"</span>, <span class="st">"big"</span>), <span class="dt">c =</span> <span class="dv">0</span>, <span class="dt">stringsAsFactors =</span> <span class="ot">TRUE</span>) <span class="co"># here, column b is a factor</span></a> <a class="sourceLine" id="cb4-3" title="3">df3 <-<span class="st"> </span>df1 <span class="op">%>%</span></a> <a class="sourceLine" id="cb4-4" title="4"><span class="st"> </span>dplyr<span class="op">::</span><span class="kw">mutate</span>(<span class="dt">b =</span> <span class="kw">as.character</span>(b))</a> <a class="sourceLine" id="cb4-5" title="5"></a> <a class="sourceLine" id="cb4-6" title="6"><span class="kw">compare_df_cols</span>(df1, df2, df3)</a> <a class="sourceLine" id="cb4-7" title="7"><span class="co">#> column_name df1 df2 df3</span></a> <a class="sourceLine" 
id="cb4-8" title="8"><span class="co">#> 1 a integer integer integer</span></a> <a class="sourceLine" id="cb4-9" title="9"><span class="co">#> 2 b character factor character</span></a> <a class="sourceLine" id="cb4-10" title="10"><span class="co">#> 3 c &lt;NA&gt; numeric &lt;NA&gt;</span></a> <a class="sourceLine" id="cb4-11" title="11"></a> <a class="sourceLine" id="cb4-12" title="12"><span class="kw">compare_df_cols</span>(df1, df2, df3, <span class="dt">return =</span> <span class="st">"mismatch"</span>)</a> <a class="sourceLine" id="cb4-13" title="13"><span class="co">#> column_name df1 df2 df3</span></a> <a class="sourceLine" id="cb4-14" title="14"><span class="co">#> 1 b character factor character</span></a> <a class="sourceLine" id="cb4-15" title="15"><span class="kw">compare_df_cols</span>(df1, df2, df3, <span class="dt">return =</span> <span class="st">"mismatch"</span>, <span class="dt">bind_method =</span> <span class="st">"rbind"</span>) <span class="co"># default is dplyr::bind_rows</span></a> <a class="sourceLine" id="cb4-16" title="16"><span class="co">#> column_name df1 df2 df3</span></a> <a class="sourceLine" id="cb4-17" title="17"><span class="co">#> 1 b character factor character</span></a> <a class="sourceLine" id="cb4-18" title="18"><span class="co">#> 2 c &lt;NA&gt; numeric &lt;NA&gt;</span></a></code></pre></div> <p><code>compare_df_cols_same()</code> returns <code>TRUE</code> or <code>FALSE</code> indicating if the data.frames can be successfully row-bound with the given binding method:</p> <div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb5-1" title="1"><span class="kw">compare_df_cols_same</span>(df1, df3)</a> <a class="sourceLine" id="cb5-2" title="2"><span class="co">#> [1] TRUE</span></a> <a class="sourceLine" id="cb5-3" title="3"><span class="kw">compare_df_cols_same</span>(df2, df3)</a> <a class="sourceLine" id="cb5-4" title="4"><span class="co">#> column_name ..1 ..2</span></a> <a
class="sourceLine" id="cb5-5" title="5"><span class="co">#> 1 b factor character</span></a> <a class="sourceLine" id="cb5-6" title="6"><span class="co">#> [1] FALSE</span></a></code></pre></div> <h2 id="exploring">Exploring</h2> <h3 id="tabyl---a-better-version-of-table"><code>tabyl()</code> - a better version of <code>table()</code></h3> <p><code>tabyl()</code> is a tidyverse-oriented replacement for <code>table()</code>. It counts combinations of one, two, or three variables, and then can be formatted with a suite of <code>adorn_*</code> functions to look just how you want. For instance:</p> <div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb6-1" title="1">mtcars <span class="op">%>%</span></a> <a class="sourceLine" id="cb6-2" title="2"><span class="st"> </span><span class="kw">tabyl</span>(gear, cyl) <span class="op">%>%</span></a> <a class="sourceLine" id="cb6-3" title="3"><span class="st"> </span><span class="kw">adorn_totals</span>(<span class="st">"col"</span>) <span class="op">%>%</span></a> <a class="sourceLine" id="cb6-4" title="4"><span class="st"> </span><span class="kw">adorn_percentages</span>(<span class="st">"row"</span>) <span class="op">%>%</span></a> <a class="sourceLine" id="cb6-5" title="5"><span class="st"> </span><span class="kw">adorn_pct_formatting</span>(<span class="dt">digits =</span> <span class="dv">2</span>) <span class="op">%>%</span></a> <a class="sourceLine" id="cb6-6" title="6"><span class="st"> </span><span class="kw">adorn_ns</span>() <span class="op">%>%</span></a> <a class="sourceLine" id="cb6-7" title="7"><span class="st"> </span><span class="kw">adorn_title</span>()</a> <a class="sourceLine" id="cb6-8" title="8"><span class="co">#> cyl </span></a> <a class="sourceLine" id="cb6-9" title="9"><span class="co">#> gear 4 6 8 Total</span></a> <a class="sourceLine" id="cb6-10" title="10"><span class="co">#> 3 6.67% (1) 13.33% (2) 80.00% (12) 100.00% (15)</span></a> <a 
class="sourceLine" id="cb6-11" title="11"><span class="co">#> 4 66.67% (8) 33.33% (4) 0.00% (0) 100.00% (12)</span></a> <a class="sourceLine" id="cb6-12" title="12"><span class="co">#> 5 40.00% (2) 20.00% (1) 40.00% (2) 100.00% (5)</span></a></code></pre></div> <p>Learn more in the <a href="http://sfirke.github.io/janitor/articles/tabyls.html">tabyls vignette</a>.</p> <h3 id="explore-records-with-duplicated-values-for-specific-combinations-of-variables-with-get_dupes">Explore records with duplicated values for specific combinations of variables with <code>get_dupes()</code></h3> <p>This is for hunting down and examining duplicate records during data cleaning - usually when there shouldn’t be any.</p> <p>For example, in a tidy data.frame you might expect to have a unique ID repeated for each year, but no duplicated pairs of unique ID & year. Say you want to check for and study any such duplicated records.</p> <p><code>get_dupes()</code> returns the records (and inserts a count of duplicates) so you can examine the problematic cases:</p> <div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb7-1" title="1"><span class="kw">get_dupes</span>(mtcars, wt, cyl) <span class="co"># or mtcars %>% get_dupes(wt, cyl) if you prefer to pipe</span></a> <a class="sourceLine" id="cb7-2" title="2"><span class="co">#> wt cyl dupe_count mpg disp hp drat qsec vs am gear carb</span></a> <a class="sourceLine" id="cb7-3" title="3"><span class="co">#> 1 3.44 6 2 19.2 167.6 123 3.92 18.30 1 0 4 4</span></a> <a class="sourceLine" id="cb7-4" title="4"><span class="co">#> 2 3.44 6 2 17.8 167.6 123 3.92 18.90 1 0 4 4</span></a> <a class="sourceLine" id="cb7-5" title="5"><span class="co">#> 3 3.57 8 2 14.3 360.0 245 3.21 15.84 0 0 3 4</span></a> <a class="sourceLine" id="cb7-6" title="6"><span class="co">#> 4 3.57 8 2 15.0 301.0 335 3.54 14.60 0 1 5 8</span></a></code></pre></div> <h1 id="minor-functions">Minor functions</h1> <p>Smaller 
functions for use in particular situations. More human-readable than the equivalent code they replace.</p> <h2 id="cleaning-1">Cleaning</h2> <h3 id="manipulate-vectors-of-names-with-make_clean_names">Manipulate vectors of names with <code>make_clean_names()</code></h3> <p>Like base R’s <code>make.names()</code>, but with the stylings and case choice of the long-time janitor function <code>clean_names()</code>. While <code>clean_names()</code> is still offered for use in a data.frame pipeline with <code>%>%</code>, <code>make_clean_names()</code> allows for more general usage, e.g., on a vector.</p> <p>It can also be used as an argument to <code>.name_repair</code> in the newest version of <code>tibble::as_tibble</code>:</p> <div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb8-1" title="1">tibble<span class="op">::</span><span class="kw">as_tibble</span>(iris, <span class="dt">.name_repair =</span> janitor<span class="op">::</span>make_clean_names)</a> <a class="sourceLine" id="cb8-2" title="2"><span class="co">#> # A tibble: 150 x 5</span></a> <a class="sourceLine" id="cb8-3" title="3"><span class="co">#> sepal_length sepal_width petal_length petal_width species</span></a> <a class="sourceLine" id="cb8-4" title="4"><span class="co">#> &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;fct&gt; </span></a> <a class="sourceLine" id="cb8-5" title="5"><span class="co">#> 1 5.1 3.5 1.4 0.2 setosa </span></a> <a class="sourceLine" id="cb8-6" title="6"><span class="co">#> 2 4.9 3 1.4 0.2 setosa </span></a> <a class="sourceLine" id="cb8-7" title="7"><span class="co">#> 3 4.7 3.2 1.3 0.2 setosa </span></a> <a class="sourceLine" id="cb8-8" title="8"><span class="co">#> 4 4.6 3.1 1.5 0.2 setosa </span></a> <a class="sourceLine" id="cb8-9" title="9"><span class="co">#> 5 5 3.6 1.4 0.2 setosa </span></a> <a class="sourceLine" id="cb8-10" title="10"><span class="co">#> 6 5.4 3.9 1.7 0.4 setosa </span></a> <a class="sourceLine" id="cb8-11"
title="11"><span class="co">#> 7 4.6 3.4 1.4 0.3 setosa </span></a> <a class="sourceLine" id="cb8-12" title="12"><span class="co">#> 8 5 3.4 1.5 0.2 setosa </span></a> <a class="sourceLine" id="cb8-13" title="13"><span class="co">#> 9 4.4 2.9 1.4 0.2 setosa </span></a> <a class="sourceLine" id="cb8-14" title="14"><span class="co">#> 10 4.9 3.1 1.5 0.1 setosa </span></a> <a class="sourceLine" id="cb8-15" title="15"><span class="co">#> # ... with 140 more rows</span></a></code></pre></div> <h3 id="remove_empty-rows-and-columns"><code>remove_empty()</code> rows and columns</h3> <p>Does what it says. For cases like cleaning Excel files that contain empty rows and columns after being read into R.</p> <div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb9-1" title="1">q <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">v1 =</span> <span class="kw">c</span>(<span class="dv">1</span>, <span class="ot">NA</span>, <span class="dv">3</span>),</a> <a class="sourceLine" id="cb9-2" title="2"> <span class="dt">v2 =</span> <span class="kw">c</span>(<span class="ot">NA</span>, <span class="ot">NA</span>, <span class="ot">NA</span>),</a> <a class="sourceLine" id="cb9-3" title="3"> <span class="dt">v3 =</span> <span class="kw">c</span>(<span class="st">"a"</span>, <span class="ot">NA</span>, <span class="st">"b"</span>))</a> <a class="sourceLine" id="cb9-4" title="4">q <span class="op">%>%</span></a> <a class="sourceLine" id="cb9-5" title="5"><span class="st"> </span><span class="kw">remove_empty</span>(<span class="kw">c</span>(<span class="st">"rows"</span>, <span class="st">"cols"</span>))</a> <a class="sourceLine" id="cb9-6" title="6"><span class="co">#> v1 v3</span></a> <a class="sourceLine" id="cb9-7" title="7"><span class="co">#> 1 1 a</span></a> <a class="sourceLine" id="cb9-8" title="8"><span class="co">#> 3 3 b</span></a></code></pre></div> <p>Just a simple wrapper for one-line 
functions, but it saves a little thinking for both the code writer and the reader.</p> <h3 id="remove_constant-columns"><code>remove_constant()</code> columns</h3> <p>Drops columns from a data.frame that contain only a single constant value (with an <code>na.rm</code> option to control whether NAs should be considered as different values from the constant).</p> <p><code>remove_constant</code> and <code>remove_empty</code> work on matrices as well as data.frames.</p> <div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb10-1" title="1">a <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">good =</span> <span class="dv">1</span><span class="op">:</span><span class="dv">3</span>, <span class="dt">boring =</span> <span class="st">"the same"</span>)</a> <a class="sourceLine" id="cb10-2" title="2">a <span class="op">%>%</span><span class="st"> </span><span class="kw">remove_constant</span>()</a> <a class="sourceLine" id="cb10-3" title="3"><span class="co">#> good</span></a> <a class="sourceLine" id="cb10-4" title="4"><span class="co">#> 1 1</span></a> <a class="sourceLine" id="cb10-5" title="5"><span class="co">#> 2 2</span></a> <a class="sourceLine" id="cb10-6" title="6"><span class="co">#> 3 3</span></a></code></pre></div> <h3 id="directionally-consistent-rounding-behavior-with-round_half_up">Directionally-consistent rounding behavior with <code>round_half_up()</code></h3> <p>R uses “banker’s rounding”, i.e., halves are rounded to the nearest <em>even</em> number. This function, an exact implementation of <a href="https://stackoverflow.com/questions/12688717/round-up-from-5/12688836#12688836">https://stackoverflow.com/questions/12688717/round-up-from-5/12688836#12688836</a>, will round all halves up. 
Compare:</p> <div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb11-1" title="1">nums <-<span class="st"> </span><span class="kw">c</span>(<span class="fl">2.5</span>, <span class="fl">3.5</span>)</a> <a class="sourceLine" id="cb11-2" title="2"><span class="kw">round</span>(nums)</a> <a class="sourceLine" id="cb11-3" title="3"><span class="co">#> [1] 2 4</span></a> <a class="sourceLine" id="cb11-4" title="4"><span class="kw">round_half_up</span>(nums)</a> <a class="sourceLine" id="cb11-5" title="5"><span class="co">#> [1] 3 4</span></a></code></pre></div> <h3 id="round-decimals-to-precise-fractions-of-a-given-denominator-with-round_to_fraction">Round decimals to precise fractions of a given denominator with <code>round_to_fraction()</code></h3> <p>Say your data should only have values of quarters: 0, 0.25, 0.5, 0.75, 1, etc. But there are either user-entered bad values like <code>0.2</code> or floating-point precision problems like <code>0.25000000001</code>. <code>round_to_fraction()</code> will enforce the desired fractional distribution by rounding the values to the nearest value given the specified denominator.</p> <p>There’s also a <code>digits</code> argument for optional subsequent rounding.</p> <h3 id="fix-dates-stored-as-serial-numbers-with-excel_numeric_to_date">Fix dates stored as serial numbers with <code>excel_numeric_to_date()</code></h3> <p>Ever load data from Excel and see a value like <code>42223</code> where a date should be? 
This function converts those serial numbers to class <code>Date</code>, with options for different Excel date encoding systems, preserving fractions of a date as time (in which case the returned value is of class <code>POSIXlt</code>), and specifying a time zone.</p>
<div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb12-1" title="1"><span class="kw">excel_numeric_to_date</span>(<span class="dv">41103</span>)</a>
<a class="sourceLine" id="cb12-2" title="2"><span class="co">#> [1] "2012-07-13"</span></a>
<a class="sourceLine" id="cb12-3" title="3"><span class="kw">excel_numeric_to_date</span>(<span class="fl">41103.01</span>) <span class="co"># ignores decimal places, returns Date object</span></a>
<a class="sourceLine" id="cb12-4" title="4"><span class="co">#> [1] "2012-07-13"</span></a>
<a class="sourceLine" id="cb12-5" title="5"><span class="kw">excel_numeric_to_date</span>(<span class="fl">41103.01</span>, <span class="dt">include_time =</span> <span class="ot">TRUE</span>) <span class="co"># returns POSIXlt object</span></a>
<a class="sourceLine" id="cb12-6" title="6"><span class="co">#> [1] "2012-07-13 00:14:24 EDT"</span></a>
<a class="sourceLine" id="cb12-7" title="7"><span class="kw">excel_numeric_to_date</span>(<span class="fl">41103.01</span>, <span class="dt">date_system =</span> <span class="st">"mac pre-2011"</span>)</a>
<a class="sourceLine" id="cb12-8" title="8"><span class="co">#> [1] "2016-07-14"</span></a></code></pre></div>
<h3 id="convert-a-mix-of-date-and-datetime-formats-to-date">Convert a mix of date and datetime formats to date</h3>
<p>Building on <code>excel_numeric_to_date()</code>, the new functions <code>convert_to_date()</code> and <code>convert_to_datetime()</code> are more robust to a mix of inputs.
Handy when reading many spreadsheets that <em>should</em> have the same column formats, but don’t.</p>
<p>For instance, here a vector with a date and an Excel datetime sees both values successfully converted to Date class:</p>
<div class="sourceCode" id="cb13"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb13-1" title="1"><span class="kw">convert_to_date</span>(<span class="kw">c</span>(<span class="st">"2020-02-29"</span>, <span class="st">"40000.1"</span>))</a>
<a class="sourceLine" id="cb13-2" title="2"><span class="co">#> [1] "2020-02-29" "2009-07-06"</span></a></code></pre></div>
<h3 id="elevate-column-names-stored-in-a-dataframe-row">Elevate column names stored in a data.frame row</h3>
<p>If a data.frame has the intended variable names stored in one of its rows, <code>row_to_names</code> will elevate the specified row to become the names of the data.frame and optionally (by default) remove the row in which names were stored and/or the rows above it.</p>
<div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb14-1" title="1">dirt &lt;-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">X_1 =</span> <span class="kw">c</span>(<span class="ot">NA</span>, <span class="st">"ID"</span>, <span class="dv">1</span><span class="op">:</span><span class="dv">3</span>),</a>
<a class="sourceLine" id="cb14-2" title="2"> <span class="dt">X_2 =</span> <span class="kw">c</span>(<span class="ot">NA</span>, <span class="st">"Value"</span>, <span class="dv">4</span><span class="op">:</span><span class="dv">6</span>))</a>
<a class="sourceLine" id="cb14-3" title="3"></a>
<a class="sourceLine" id="cb14-4" title="4"><span class="kw">row_to_names</span>(dirt, <span class="dv">2</span>)</a>
<a class="sourceLine" id="cb14-5" title="5"><span class="co">#> ID Value</span></a>
<a class="sourceLine" id="cb14-6" title="6"><span class="co">#> 3 1 4</span></a>
<a class="sourceLine"
id="cb14-7" title="7"><span class="co">#> 4 2 5</span></a>
<a class="sourceLine" id="cb14-8" title="8"><span class="co">#> 5 3 6</span></a></code></pre></div>
<h2 id="exploring-1">Exploring</h2>
<h3 id="count-factor-levels-in-groups-of-high-medium-and-low-with-top_levels">Count factor levels in groups of high, medium, and low with <code>top_levels()</code></h3>
<p>Originally designed for use with Likert survey data stored as factors. Returns a <code>tbl_df</code> frequency table with appropriately-named rows, grouped into head/middle/tail groups.</p>
<ul>
<li>Takes a user-specified size for the head/tail groups</li>
<li>Automatically calculates a percent column</li>
<li>Supports sorting</li>
<li>Can show or hide <code>NA</code> values.</li>
</ul>
<!-- end list -->
<div class="sourceCode" id="cb15"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb15-1" title="1">f &lt;-<span class="st"> </span><span class="kw">factor</span>(<span class="kw">c</span>(<span class="st">"strongly agree"</span>, <span class="st">"agree"</span>, <span class="st">"neutral"</span>, <span class="st">"neutral"</span>, <span class="st">"disagree"</span>, <span class="st">"strongly agree"</span>),</a>
<a class="sourceLine" id="cb15-2" title="2"> <span class="dt">levels =</span> <span class="kw">c</span>(<span class="st">"strongly agree"</span>, <span class="st">"agree"</span>, <span class="st">"neutral"</span>, <span class="st">"disagree"</span>, <span class="st">"strongly disagree"</span>))</a>
<a class="sourceLine" id="cb15-3" title="3"><span class="kw">top_levels</span>(f)</a>
<a class="sourceLine" id="cb15-4" title="4"><span class="co">#> f n percent</span></a>
<a class="sourceLine" id="cb15-5" title="5"><span class="co">#> strongly agree, agree 3 0.5000000</span></a>
<a class="sourceLine" id="cb15-6" title="6"><span class="co">#> neutral 2 0.3333333</span></a>
<a class="sourceLine" id="cb15-7" title="7"><span class="co">#> disagree, strongly disagree 1 0.1666667</span></a>
<a class="sourceLine" id="cb15-8" title="8"><span class="kw">top_levels</span>(f, <span class="dt">n =</span> <span class="dv">1</span>)</a>
<a class="sourceLine" id="cb15-9" title="9"><span class="co">#> f n percent</span></a>
<a class="sourceLine" id="cb15-10" title="10"><span class="co">#> strongly agree 2 0.3333333</span></a>
<a class="sourceLine" id="cb15-11" title="11"><span class="co">#> agree, neutral, disagree 4 0.6666667</span></a>
<a class="sourceLine" id="cb15-12" title="12"><span class="co">#> strongly disagree 0 0.0000000</span></a></code></pre></div>
</body> </html>