\documentclass[a4paper]{article}

\usepackage[margin=3cm]{geometry}
%%\usepackage[round]{natbib}
\usepackage[colorlinks=true,urlcolor=blue]{hyperref}

%% Semantic markup macros: \code for inline code, \pkg for package names,
%% \proglang for programming language names.
%%\newcommand{\acronym}[1]{\textsc{#1}}
%%\newcommand{\class}[1]{\mbox{\textsf{#1}}}
\newcommand{\code}[1]{\mbox{\texttt{#1}}}
\newcommand{\pkg}[1]{{\normalfont\fontseries{b}\selectfont #1}}
\newcommand{\proglang}[1]{\textsf{#1}}
\SweaveOpts{keep.source=TRUE}
%% \VignetteIndexEntry{Timings of common tasks}

%% Setup chunk: attaches data.table unless an object of that name already
%% exists in the global environment, removes from the global environment the
%% objects listed by tables(), and sets the output width to 70 characters.
%% NOTE(review): the chunk options were missing from the header; a hidden
%% setup chunk (echo=FALSE, results=hide) is assumed here -- please confirm.
<<echo=FALSE,results=hide>>=
if (!exists("data.table",.GlobalEnv)) library(data.table)  # see Intro.Rnw for comments on these two lines
rm(list=as.character(tables()$NAME),envir=.GlobalEnv)
options(width=70)  # so lines wrap round
@

\begin{document}
\title{Timings of common tasks using the \pkg{data.table} package in \proglang{R}}
\author{Matthew Dowle}
\date{Revised: \today\\(A later revision may be available on the \href{http://datatable.r-forge.r-project.org/}{homepage})}
\maketitle

* WORK IN PROGRESS *

This document contains a series of tests, followed by a summary table of various timings and comparisons.
Please go straight to the summary table first in which each row has a link back to the test.

This document is reproducible.
Simply run the .Rnw file yourself in your environment to confirm the results.
Also see \code{?vignette}, which says that \code{edit(vignette("datatable-timings"))} will extract the code from this document so you can easily work with it.

The .Rnw included in the package has N=10,000,000.
This is a small number so that \code{R CMD build} completes in a reasonable time (about 5 minutes).
We don't want the nightly builds on R-Forge and CRAN to slow down just to run long timing comparisons.
We have increased this to N=100,000,000 ourselves, and included the output on the \href{http://datatable.r-forge.r-project.org/}{datatable homepage}.

\tableofcontents

\section{Timing tests}

\subsection{Extraction}

This is a repeat of the test in section 1 of the Introduction vignette.
The syntax is explained there.
This demonstrates the large difference in speed between vector scans and binary search.
Therefore, please avoid using \code{==} in the \code{i} expression.

%% Test 1 (extraction): builds a data.frame DF and a keyed data.table DT with
%% the same columns x, y, v, then times a vector-scan subset of DF (tt)
%% against a keyed join on DT (ss). Documentation is kept in LaTeX comments
%% because keep.source=TRUE would print R comments in the echoed chunk.
<<>>=
n = ceiling(1e7/26^2)   # 10 million rows
DF = data.frame(x=rep(LETTERS,each=26*n),
                y=rep(letters,each=n),
                v=rnorm(n*26^2),
                stringsAsFactors=FALSE)
DT = as.data.table(DF)
system.time(setkey(DT,x,y))  # one-off cost, usually
tables()
tt=system.time(ans1 <- DF[DF$x=="R" & DF$y=="h",]); tt
head(ans1)
dim(ans1)
ss=system.time(ans2 <- DT[J("R","h")]); ss
head(ans2)
dim(ans2)
identical(ans1$v,ans2$v)
@
%% Hidden guard: abort the build if the two extraction methods disagree.
<<echo=FALSE>>=
if(!identical(ans1$v,ans2$v)) stop("Test 1 not identical")
@

\subsection{Grouping}

This is a repeat of the test in section 2 of the Introduction vignette.
The syntax is explained there.

%% Test 2 (grouping): times tapply() on DF (ttt) against by= grouping on DT
%% (sss); both compute sum(v) for each value of x.
<<>>=
ttt=system.time(ans1 <- tapply(DF$v,DF$x,sum)); ttt
head(ans1)
sss=system.time(ans2 <- DT[,sum(v),by=x]); sss
head(ans2)
identical(as.vector(ans1), ans2$V1)
@
%% Hidden guard: abort the build if the two grouping methods disagree.
<<echo=FALSE>>=
if(!identical(as.vector(ans1), ans2$V1)) stop("Test 2 not identical")
@

%% Placeholder subsections; no content yet.
\subsection{Test 3}

\subsection{Test 4}

\subsection{Test 5}

\section{Summary table}

%% Builds the summary matrix from element 3 of each system.time() result
%% (the elapsed time): one row per test, one column per implementation,
%% plus the base/data.table ratio truncated to an integer.
%% NOTE(review): if a data.table timing is 0 the ratio is Inf and
%% as.integer() yields NA -- confirm whether that case needs handling.
<<echo=FALSE>>=
ans = matrix(c(tt[3],ss[3],ttt[3],sss[3]),byrow=TRUE,ncol=2)
rownames(ans)=c("==","tapply")
colnames(ans)=c("base","data.table")
ans = cbind(ans,"times faster"=as.integer(ans[,1]/ans[,2]))
@
<<>>=
ans
@

%% results=tex so that the LaTeX produced by toLatex() is typeset rather than
%% shown as verbatim output.
%% NOTE(review): the chunk options were missing from the header -- confirm.
<<echo=FALSE,results=tex>>=
toLatex(sessionInfo())
@

\end{document}