if (!exists("test.data.table",.GlobalEnv,inherits=FALSE)) { require(data.table) # in dev the package should not be loaded suppressWarnings({ require(reshape2) require(reshape) require(plyr) require(ggplot2) # the 2 ggplot tests take so long they get in the way in dev require(hexbin) require(nlme) require(xts) require(bit64) require(gdata) }) # The suppression is for users running test.data.table(). If they don't have ggplot2 installed we don't # want to warn about it not being installed. 'R CMD check' is a stricter test that will warn if the # packages above are not installed, via DESCRIPTION:Suggests. .devtesting=FALSE } else .devtesting=TRUE options(warn=2) nfail = ntest = lastnum = 0 whichfail = NULL compactprint = function(DT, topn=2) { cn = paste(" [Key=",paste(key(DT),collapse=",")," Types=",paste(substring(sapply(DT,class),1,3),collapse=","),"]",sep="") print(copy(DT)[,(cn):=""], topn=topn) invisible() } test = function(num,x,y,error=NULL,warning=NULL,output=NULL) { # Usage: # i) tests that x equals y when both x and y are supplied, the most common usage # ii) tests that x is TRUE when y isn't supplied # iii) if error is supplied, y should be missing and x is tested to result in an error message matching the pattern # iv) if warning is supplied, y (if supplied) is checked to equal x, and x should result in a warning message matching the pattern # v) if output is supplied, x is evaluated and printed and the output is checked to match the pattern # At most one of error, warning or output may be supplied, all single character strings (passed to grep) # num just needs to be numeric and unique. We normally increment integers at the end, but inserts can be made using decimals e.g. 10,11,11.1,11.2,12,13,... # Motivations: # 1) we'd like to know all tests that fail not just stop at the first. This often helps by revealing a common feature across a set of # failing tests # 2) test() tests more deeply than a diff on console output and uses a data.table appropriate definition of "equals" different # from all.equal and different to identical related to row.names and unused factor levels # 3) each test has a unique id which we refer to in commit messages, emails etc. # print(paste("running test #", num, sep="")) # TO DO: every line should be inside test() so we don't have to show the numbers. .lasttest <<- num ntest <<- ntest + 1 lastnum <<- num if (is.null(output)) err <<- try(x,TRUE) else out = gsub("NULL$","",paste(capture.output(print(err<<-try(x,TRUE))),collapse="")) if (!is.null(error) || !is.null(warning)) { type = ifelse(!is.null(error),"error","warning") patt = txt = ifelse(!is.null(error),error,warning) patt = gsub("[(]","[(]",patt) patt = gsub("[)]","[)]",patt) patt = gsub("\\^","\\\\^", patt) # for test 923 containing 2^31 in error message observedtype = ifelse(length(grep("converted from warning",err)), "warning", "error") if (! (inherits(err,"try-error") && length(grep(patt,err)) && type==observedtype)) { cat("Test",num,"didn't produce correct",type,":\n") cat(">",deparse(substitute(x)),"\n") cat("Expected ",type,": '",txt,"'\n",sep="") if (!inherits(err,"try-error")) cat("Observed: no error or warning\n") else cat("Observed ",observedtype,": '",gsub("^[(]converted from warning[)] ","",gsub("\n$","",gsub("^Error.* : \n ","",as.character(err)))),"'\n",sep="") nfail <<- nfail + 1 whichfail <<- c(whichfail, num) return() } if (type=="warning") err <- if (is.null(output)) x<-try(suppressWarnings(x),TRUE) else out<-paste(capture.output(x<-try(suppressWarnings(x),TRUE)),collapse="") else return() } if (inherits(err,"try-error") || (!missing(y) && inherits(err<-try(y,TRUE),"try-error"))) { cat("Test",num,err) nfail <<- nfail + 1 whichfail <<- c(whichfail, num) return() } if (!is.null(output)) { output = gsub("[[]","",output) output = gsub("[]]","",output) output = gsub("","[[]",output) output = gsub("","[]]",output) output = gsub("[(]","[(]",output) output = gsub("[)]","[)]",output) if (!length(grep(output,out))) { cat("Test",num,"didn't produce correct output:\n") cat(">",deparse(substitute(x)),"\n") cat("Expected: '",output,"'\n",sep="") cat("Observed: '",out,"'\n",sep="") nfail <<- nfail + 1 whichfail <<- c(whichfail, num) return() } if (missing(y)) return() } if (missing(y)) { if (isTRUE(as.vector(x))) return() # as.vector to drop names of a named vector such as returned by system.time cat("Test",num,"expected TRUE but observed:\n") cat(">",deparse(substitute(x)),"\n") if (is.data.table(x)) compactprint(x) else print(x) nfail <<- nfail + 1 whichfail <<- c(whichfail, num) return() } else { if (identical(x,y)) return() if (is.data.table(x) && is.data.table(y)) { # TO DO: test 166 doesn't pass with these : # if (!selfrefok(x)) stop("x selfref not ok") # if (!selfrefok(y)) stop("y selfref not ok") xc=copy(x) yc=copy(y) # so we don't affect the original data which may be used in the next test # drop unused levels in factors if (length(x)) for (i in which(sapply(x,is.factor))) {.xi=x[[i]];xc[,i:=factor(.xi),with=FALSE]} if (length(y)) for (i in which(sapply(y,is.factor))) {.yi=y[[i]];yc[,i:=factor(.yi),with=FALSE]} if (length(attr(x,"row.names"))) setattr(xc,"row.names",NULL) # for test 165+, i.e. x may have row names set from inheritance but y won't, consider these equal if (length(attr(y,"row.names"))) setattr(yc,"row.names",NULL) if (identical(xc,yc) && identical(key(x),key(y))) return() # check key too because := might have cleared it if (isTRUE(all.equal(xc,yc)) && identical(key(x),key(y))) return() } if (is.factor(x) && is.factor(y)) { x = factor(x) y = factor(y) if (identical(x,y)) return() } if (is.atomic(x) && is.atomic(y) && isTRUE(all.equal(x,y))) return() # For test 617 on r-prerel-solaris-sparc on 7 Mar 2013 } cat("Test",num,"ran without errors but failed check that x equals y:\n") cat("> x =",deparse(substitute(x)),"\n") if (is.data.table(x)) compactprint(x) else print(x) cat("> y =",deparse(substitute(y)),"\n") if (is.data.table(y)) compactprint(y) else print(y) nfail <<- nfail + 1 whichfail <<- c(whichfail, num) } .timingtests = FALSE started.at = Sys.time() if (!.devtesting) { is.sorted = data.table:::is.sorted forder = data.table:::forder null.data.table = data.table:::null.data.table ordernumtol = data.table:::ordernumtol # TO DO: deprecated, remove iradixorder = data.table:::iradixorder # TO DO: deprecated, remove dradixorder = data.table:::dradixorder # TO DO: deprecated, remove uniqlist = data.table:::uniqlist uniqlengths = data.table:::uniqlengths setrev = data.table:::setrev setreordervec = data.table:::setreordervec .R.listCopiesNamed = data.table:::.R.listCopiesNamed .R.assignNamesCopiesAll = data.table:::.R.assignNamesCopiesAll .R.subassignCopiesOthers = data.table:::.R.subassignCopiesOthers .R.subassignCopiesVecsxp = data.table:::.R.subassignCopiesVecsxp } TESTDT = data.table(a=as.integer(c(1,3,4,4,4,4,7)), b=as.integer(c(5,5,6,6,9,9,2)), v=1:7) setkey(TESTDT,a,b) # i.e. a b v # [1,] 1 5 1 # [2,] 3 5 2 # [3,] 4 6 3 # [4,] 4 6 4 # [5,] 4 9 5 # [6,] 4 9 6 # [7,] 7 2 7 INT = function(...) { as.integer(c(...)) } ########################## test(1, TESTDT[SJ(4,6),v,mult="first"], 3L) test(2, TESTDT[SJ(4,6),v,mult="last"], 4L) test(3, TESTDT[SJ(c(4,4,4),c(6,6,7)),v,mult="last",roll=TRUE], INT(4,4,4)) test(4, TESTDT[SJ(c(4,4,4),c(9,9,10)),v,mult="last",roll=TRUE], INT(6,6,6)) test(5, TESTDT[SJ(c(4,4,4),c(6,6,7)),v,mult="last",roll=TRUE,rollends=FALSE], INT(4,4,4)) test(6, TESTDT[SJ(c(4,4,4),c(9,9,10)),v,mult="last",roll=TRUE,rollends=FALSE], INT(6,6,NA)) test(7, TESTDT[SJ(c(4,4,4),c(9,9,10)),v,mult="first",roll=TRUE,rollends=FALSE], INT(5,5,NA)) test(8, TESTDT[SJ(c(-9,1,4,4,8),c(1,4,4,10,1)),v]$v, INT(NA,NA,NA,NA,NA)) test(9, TESTDT[SJ(c(-9,1,4,4,8),c(1,4,4,10,1)),v,roll=TRUE]$v, INT(NA,NA,NA,6,NA)) test(10, TESTDT[SJ(c(-9,1,4,4,8),c(1,4,4,10,1)),v,roll=TRUE,rollends=FALSE]$v, INT(NA,NA,NA,NA,NA)) test(11, TESTDT[SJ(c(-3,2,4,4,5,7,8)),v,mult="first"], INT(NA,NA,3,3,NA,7,NA)) test(12, TESTDT[SJ(c(-3,2,4,4,5,7,8)),v,mult="first",roll=TRUE], INT(NA,1,3,3,6,7,7)) test(13, TESTDT[SJ(c(-3,2,4,4,5,7,8)),v,mult="last"], INT(NA,NA,6,6,NA,7,NA)) test(14, TESTDT[SJ(c(-3,2,4,4,5,7,8)),v,mult="last",roll=TRUE], INT(NA,1,6,6,6,7,7)) test(15, TESTDT[SJ(c(-3,2,4,4,5,7,8)),v,mult="last",nomatch=0], INT(6,6,7)) test(16, TESTDT[SJ(c(4)),v][[2]], INT(3,4,5,6)) #test(17, suppressWarnings(TESTDT[SJ(c(4,4)),v,mult="all",incbycols=FALSE][[1]]), INT(3:6,3:6)) test(18, TESTDT[SJ(c(-3,2,4,8)),v,mult="all",nomatch=0][[2]], INT(3:6)) test(185, TESTDT[SJ(c(-3,2,4,8)),v,mult="all",nomatch=NA][[2]], INT(NA,NA,3:6,NA)) test(19, TESTDT[SJ(c(-3,2,4,8)),v,mult="all",roll=TRUE,nomatch=0][[2]], INT(1,3:6,7)) test(186, TESTDT[SJ(c(-3,2,4,8)),v,mult="all",roll=TRUE,nomatch=NA][[2]], INT(NA,1,3:6,7)) test(20, TESTDT[SJ(c(-3,2,4,8)),v,mult="all",roll=TRUE,rollends=FALSE,nomatch=0][[2]], INT(1,3:6)) test(187, TESTDT[SJ(c(-3,2,4,8)),v,mult="all",roll=TRUE,rollends=FALSE,nomatch=NA][[2]], INT(NA,1,3:6,NA)) test(21, TESTDT[SJ(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",nomatch=0][[3]], INT(1,3:4)) test(188, TESTDT[SJ(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",nomatch=NA][[3]], INT(NA,1,NA,3:4,NA,NA,NA)) test(22, TESTDT[SJ(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,nomatch=0][[3]], INT(1,3:4,4,6)) test(189, TESTDT[SJ(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,nomatch=NA][[3]], INT(NA,1,NA,3:4,4,6,NA)) test(23, TESTDT[SJ(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,rollends=FALSE,nomatch=0][[3]], INT(1,3:4,4)) test(190, TESTDT[SJ(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,rollends=FALSE,nomatch=NA][[3]], INT(NA,1,NA,3:4,4,NA,NA)) test(24, TESTDT[SJ(c(1,NA,4,NA,NA,4,4),c(5,5,6,6,7,9,10)),v,mult="all",roll=TRUE,nomatch=0][[3]], INT(1,3:4,5:6,6)) test(191, TESTDT[SJ(c(1,NA,4,NA,NA,4,4),c(5,5,6,6,7,9,10)),v,mult="all",roll=TRUE,nomatch=NA][[3]], INT(NA,NA,NA,1,3:4,5:6,6)) # Note that the NAs get sorted to the beginning by the SJ(). # i.e. a b v (same test matrix, repeating here for easier reading of the test cases below) # [1,] 1 5 1 # [2,] 3 5 2 # [3,] 4 6 3 # [4,] 4 6 4 # [5,] 4 9 5 # [6,] 4 9 6 # [7,] 7 2 7 test(25, TESTDT[SJ(4,6),v,mult="first"], 3L) test(26, TESTDT[SJ(4,6),v,mult="last"], 4L) test(27, TESTDT[J(c(4,4,4),c(7,6,6)),v,mult="last",roll=TRUE], INT(4,4,4)) test(28, TESTDT[J(c(4,4,4),c(10,9,9)),v,mult="last",roll=TRUE], INT(6,6,6)) test(29, TESTDT[J(c(4,4,4),c(7,6,6)),v,mult="last",roll=TRUE,rollends=FALSE], INT(4,4,4)) test(30, TESTDT[J(c(4,4,4),c(10,9,9)),v,mult="last",roll=TRUE,rollends=FALSE], INT(NA,6,6)) test(31, TESTDT[J(c(4,4,4),c(10,9,9)),v,mult="first",roll=TRUE,rollends=FALSE], INT(NA,5,5)) test(32, TESTDT[J(c(8,1,4,4,-9),c(1,4,4,10,1)),v]$v, INT(NA,NA,NA,NA,NA)) test(33, TESTDT[J(c(8,1,4,4,-9),c(1,4,4,10,1)),v,roll=TRUE]$v, INT(NA,NA,NA,6,NA)) test(34, TESTDT[J(c(8,1,4,4,-9),c(1,4,7,10,1)),v,roll=TRUE,rollends=FALSE]$v, INT(NA,NA,4,NA,NA)) test(35, TESTDT[J(c(5,4,-3,8,4,7,2)),v,mult="first"], INT(NA,3,NA,NA,3,7,NA)) test(36, TESTDT[J(c(5,4,-3,8,4,7,2)),v,mult="first",roll=TRUE], INT(6,3,NA,7,3,7,1)) test(37, TESTDT[J(c(5,4,-3,8,4,7,2)),v,mult="last"], INT(NA,6,NA,NA,6,7,NA)) test(38, TESTDT[J(c(5,4,-3,8,4,7,2)),v,mult="last",roll=TRUE], INT(6,6,NA,7,6,7,1)) test(39, TESTDT[J(c(5,4,-3,8,4,7,2)),v,mult="last",nomatch=0], INT(6,6,7)) test(40, TESTDT[J(c(4)),v,mult="all"][[2]], INT(3,4,5,6)) test(41, TESTDT[J(c(4,4)),v,mult="all"][[2]], INT(3:6,3:6)) test(42, TESTDT[J(c(8,2,4,-3)),v,mult="all",nomatch=0][[2]], INT(3:6)) test(192, TESTDT[J(c(8,2,4,-3)),v,mult="all",nomatch=NA][[2]], INT(NA,NA,3:6,NA)) test(43, TESTDT[J(c(8,2,4,-3)),v,mult="all",roll=TRUE,nomatch=0][[2]], INT(7,1,3:6)) test(193, TESTDT[J(c(8,2,4,-3)),v,mult="all",roll=TRUE,nomatch=NA][[2]], INT(7,1,3:6,NA)) #test(44, suppressWarnings(TESTDT[J(c(8,4,2,-3)),v,mult="all",roll=TRUE,rollends=FALSE,incbycols=FALSE][[1]]), INT(3:6,1)) test(45, TESTDT[J(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",nomatch=0][[3]], INT(1,3:4)) test(194, TESTDT[J(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",nomatch=NA][[3]], INT(NA,1,NA,3:4,NA,NA,NA)) test(46, TESTDT[J(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,nomatch=0][[3]], INT(1,3:4,4,6)) test(195, TESTDT[J(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,nomatch=NA][[3]], INT(NA,1,NA,3:4,4,6,NA)) test(47, TESTDT[J(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,rollends=FALSE,nomatch=0][[3]], INT(1,3:4,4)) test(196, TESTDT[J(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,rollends=FALSE,nomatch=NA][[3]], INT(NA,1,NA,3:4,4,NA,NA)) test(48, TESTDT[J(c(-9,NA,4,NA,1,4,4),c(1,5,9,6,5,9,10)),v,mult="all",roll=TRUE,nomatch=0][[3]], INT(5:6,1,5:6,6)) # this time the NAs stay where they are. Compare to test 24 above. test(197, TESTDT[J(c(-9,NA,4,NA,1,4,4),c(1,5,9,6,5,9,10)),v,mult="all",roll=TRUE,nomatch=NA][[3]], INT(NA,NA,5:6,NA,1,5:6,6)) test(49, TESTDT[J(c(4,1,0,5,3,7,NA,4,1),c(6,5,1,10,5,2,1,6,NA)),v,nomatch=0]$v, INT(3,4,1,2,7,3,4)) test(198, TESTDT[J(c(4,1,0,5,3,7,NA,4,1),c(6,5,1,10,5,2,1,6,NA)),v,nomatch=NA]$v, INT(3,4,1,NA,NA,2,7,NA,3,4,NA)) test(50, TESTDT[J(c(4,1,0,5,3,7,NA,4,1),c(6,5,1,10,5,2,1,6,NA)),v,mult="last",nomatch=0], INT(4,1,2,7,4)) test(199, TESTDT[J(c(4,1,0,5,3,7,NA,4,1),c(6,5,1,10,5,2,1,6,NA)),v,mult="last",nomatch=NA], INT(4,1,NA,NA,2,7,NA,4,NA)) TESTDT[, a:=letters[a]] setkey(TESTDT,a,b) # i.e. a b v # [1,] a 5 1 # [2,] c 5 2 # [3,] d 6 3 # [4,] d 6 4 # [5,] d 9 5 # [6,] d 9 6 # [7,] g 2 7 test(51, TESTDT[SJ(c("d","d","e","g"),c(6,7,1,2)),v,mult="all",roll=TRUE,nomatch=0][[3]], INT(3:4,4,7)) test(200, TESTDT[SJ(c("d","d","e","g"),c(6,7,1,2)),v,mult="all",roll=TRUE,nomatch=NA][[3]], INT(3:4,4,NA,7)) test(52, TESTDT[J(c("g","d","e","d"),c(6,6,1,2)),v,mult="all",roll=TRUE,nomatch=0][[3]], INT(7,3:4)) test(201, TESTDT[J(c("g","d","e","d"),c(6,6,1,2)),v,mult="all",roll=TRUE,nomatch=NA][[3]], INT(7,3:4,NA,NA)) TESTDT[, b:=letters[b]] setkey(TESTDT,a,b) # i.e. # a b v # [1,] a e 1 # [2,] c e 2 # [3,] d f 3 # [4,] d f 4 # [5,] d i 5 # [6,] d i 6 # [7,] g b 7 test(53, TESTDT[SJ(c("d","d","e","g"),c("f","g","a","b")),v,mult="last"], INT(4,NA,NA,7)) test(54, TESTDT[J(c("g","d","e","d"),c("b","g","a","f")),v,mult="last"], INT(7,NA,NA,4)) # this tests (d,g) ok even though there is an NA in last match in the roll. test(55, TESTDT[SJ(c("d","d","e","g"),c("f","g","a","b")),v,mult="first"], INT(3,NA,NA,7)) test(56, TESTDT[J(c("g","d","e","d"),c("b","g","a","f")),v,mult="first"], INT(7,NA,NA,3)) test(57, TESTDT[J(c("g","d","d","d","e","d"),c("b","g","k","b","a","f")),v,roll=TRUE][[3]], INT(7,4,6,NA,NA,3,4)) # test 58 removed. Tested this failed (rolling join on factors) pre character columns, now works. test(59, TESTDT[J(c("g","d","d","d","e","d"),c("b","g","k","b","a","f")),v,roll=TRUE,rollends=FALSE][[3]], INT(7,4,NA,NA,NA,3,4)) # test 60 removed. Tested this failed (rolling join on factors) pre character columns, now works. # Tests 61-66 were testing sortedmatch which is now replaced by chmatch for characters, and removed # for integers until needed. # Test 67 removed. No longer use factors so debate/problem avoided. # [.factor and c.factor are no longer present in data.table, not even hidden away # X = factor(letters[1:10]) # test(67, levels(X[4:6]), letters[4:6]) test(68, "TESTDT" %in% tables(silent=TRUE)[,NAME]) # NAME is returned as a column in which we look for the string test(69, "TESTDT" %in% tables(silent=TRUE)[,as.character(NAME)]) # an old test (from when NAME was factor) but no harm in keeping it a = "d" # Variable Twister. a in this scope has same name as a inside DT scope. # Aug 2010 : As a result of bug 1005, and consistency with 'j' and 'by' we now allow self joins (test 183) in 'i'. test(70, TESTDT[eval(J(a)),v], data.table(a="d",v=3:6,key="a")) # the eval() enabled you to use the 'a' in the calling scope, not 'a' in the TESTDT. TO DO: document this. test(71, TESTDT[eval(SJ(a)),v], data.table(a="d",v=3:6,key="a")) test(72, TESTDT[eval(CJ(a)),v], data.table(a="d",v=3:6,key="a")) test(73, TESTDT[,v], 1:7) test(74, TESTDT[,3], 3) test(74.5, TESTDT[,3L], 3L) test(75, TESTDT[,"v"], "v") test(76, TESTDT[,2:3], 2:3) # See ?[.data.table that with=FALSE is required for the likely intended result test(77, TESTDT[,2:3,with=FALSE], data.table(b=c("e","e","f","f","i","i","b"),v=1:7)) test(78, TESTDT[,c("b","v"),with=FALSE], data.table(b=c("e","e","f","f","i","i","b"),v=1:7)) colsVar = c("b","v") test(79, TESTDT[,colsVar], colsVar) test(80, TESTDT[,colsVar,with=FALSE], data.table(b=c("e","e","f","f","i","i","b"),v=1:7)) # works in test.data.table, but not eval(body(test.data.table)) when in R CMD check ... test(81, TESTDT[1:2,c(a,b)], factor(c("a","c","e","e"))) # It is expected the above to be common source of confusion. c(a,b) is evaluated within # the frame of TESTDT, and c() creates one vector, not 2 column subset as in data.frame's. # If 2 columns were required use list(a,b). c() can be useful too, but is different. test(82, TESTDT[,c("a","b")], c("a","b")) test(83, TESTDT[,list("a","b")], data.table(V1="a",V2="b")) test(83.1, TESTDT[,list("sum(a),sum(b)")], data.table("sum(a),sum(b)")) test(83.2, TESTDT[,list("sum(a),sum(b)"),by=a], {tt=data.table(a=c("a","c","d","g"),V1="sum(a),sum(b)",key="a");tt$V1=as.character(tt$V1);tt}) test(84, TESTDT[1:2,list(a,b)], data.table(a=c("a","c"), b=c("e","e"), key = 'a,b')) # test(85, TESTDT[1:2,DT(a,b)], data.table(a=c("a","c"), b=c("e","e"))) #DT() now deprecated test(86, TESTDT[,sum(v),by="b"], data.table(b=c("e","f","i","b"),V1=INT(3,7,11,7))) # TESTDT is key'd by a,b, so correct that grouping by b should not be key'd in the result by default test(87, TESTDT[,list(MySum=sum(v)),by="b"], data.table(b=c("e","f","i","b"),MySum=INT(3,7,11,7))) test(88, TESTDT[,list(MySum=sum(v),Sq=v*v),by="b"][1:3], data.table(b=c("e","e","f"),MySum=INT(3,3,7),Sq=INT(1,4,9))) # silent repetition of MySum to match the v*v vector # Test 89 dropped. Simplify argument no longer exists. by is now fast and always returns a data.table ... test(89, TESTDT[,sum(v),by="b",simplify=FALSE], list(7L,3L,7L,11L)) # Test 88.5 contributed by Johann Hibschman (for bug fix #1294) : test(88.5, TESTDT[a=="d",list(MySum=sum(v)),by=list(b)], data.table(b=c("f","i"), MySum=INT(7,11), key="b")) setkey(TESTDT,b) test(90, TESTDT[J(c("f","i")),sum(v),mult="all"], data.table(b=c("f","i"),V1=c(7L,11L),key="b")) # aggregation via groups passed into i and mult="all" test(90.5, TESTDT[J(c("i","f")),sum(v),mult="all"], data.table(b=c("i","f"),V1=c(11L,7L))) # test not keyed test(91, TESTDT[SJ(c("f","i")),sum(v),mult="all"], data.table(b=c("f","i"),V1=c(7L,11L),key="b")) # aggregation via groups passed into i and mult="all" # Test 92 dropped same reason as 89 ... test(TESTDT[92, J(c("f","i")),sum(v),mult="all",simplify=FALSE], list(7L,11L)) test(93, TESTDT[c("f","i"), which=TRUE], 4:7) test(94, TESTDT[c("i","f"), mult="last", which=TRUE], INT(7,5)) test(95, TESTDT["f",v]$v, 3:4) test(96, TESTDT["f",v,mult="all"], data.table(b="f",v=3:4,key="b")) test(97, TESTDT[c("f","i","b"),list(GroupSum=sum(v)),mult="all"], data.table(b=c("f","i","b"), GroupSum=c(7L,11L,7L))) # mult="all" is required here since only b is key'd # that line above doesn't create a key on the result so that the order fib is preserved. test(98, TESTDT[SJ(c("f","i","b")),list(GroupSum=sum(v)),mult="all"], data.table(b=c("b","f","i"), GroupSum=c(7L,7L,11L), key="b")) # line above is the way to group, sort by group and setkey on the result by group. dt <- data.table(A = rep(1:3, each=4), B = rep(11:14, each=3), C = rep(21:22, 6), key = "A,B") test(99, unique(dt), data.table(dt[c(1L, 4L, 5L, 7L, 9L, 10L)], key="A,B")) # test [<- for column assignment dt1 <- dt2 <- dt test(100, {dt1[,"A"] <- 3L; dt1}, {dt2$A <- 3L; dt2}) # test transform and within test(101, within(dt, {D <- B^2}), transform(dt, D = B^2)) test(102, within(dt, {A <- B^2}), transform(dt, A = B^2)) # test .SD object test(103, dt[, sum(.SD$B), by = "A"], dt[, sum(B), by = "A"]) test(104, dt[, transform(.SD, D = min(B)), by = "A"], dt[, list(B,C,D=min(B)), by = "A"]) # test numeric and comparison operations on a data table test(105, all(dt + dt > dt)) test(106, all(dt + dt > 1)) test(107, dt + dt, dt * 2L) # test a few other generics: test(108, dt, data.table(t(t(dt)),key="A,B")) test(109, all(!is.na(dt))) dt2 <- dt dt2$A[1] <- NA # removes key test(110, sum(is.na(dt2)), 1L) test(111, {setkey(dt,NULL);dt}, na.omit(dt)) test(112, dt2[2:nrow(dt2),A], na.omit(dt2)$A) # test [<- assignment: dt2[is.na(dt2)] <- 1L test(113, {setkey(dt,NULL);dt}, dt2) # key should be dropped because we assigned to a key column # want to discourage this going forward (inefficient to create RHS like this) # dt2[, c("A", "B")] <- dt1[, c("A", "B"), with = FALSE] # test(114, dt1, dt2) ## doesn't work, yet: ## dt2[rep(TRUE, nrow(dt)), c("A", "B")] <- dt1[, c("A", "B"), with = FALSE] ## dt2[rep(TRUE, nrow(dt)), c("A")] <- dt1[, c("A"), with = FALSE] ## test(dt, dt2)) stop("Test 112 failed") # test the alternate form of setkey: dt1 = copy(dt) dt2 = copy(dt) setkeyv(dt1, "A") setkey(dt2, A) test(115, dt1, dt2) # Test dogroups works correctly for character/factor columns test(116, TESTDT[,a[1],by="b"], data.table(b=c("b","e","f","i"), V1=c("g","a","d","d"), key="b")) test(117, TESTDT[,list(a[1],v[1]),by="b"], data.table(b=c("b","e","f","i"), V1=c("g","a","d","d"), V2=INT(7,1,3,5), key="b")) # We no longer check i for out of bounds, for consistency with data.frame. NA rows should be returned for i>nrow test(118, TESTDT[8], data.table(a=as.character(NA), b=as.character(NA), v=as.integer(NA), key="b")) test(119, TESTDT[6:9], data.table(a=c("d","d",NA,NA), b=c("i","i",NA,NA), v=c(5L,6L,NA,NA))) n=10000 grp1=sample(1:50,n,replace=TRUE) grp2=sample(1:50,n,replace=TRUE) dt=data.table(x=rnorm(n),y=rnorm(n),grp1=grp1,grp2=grp2) tt = system.time(ans <- dt[,list(.Internal(mean(x)),.Internal(mean(y))),by="grp1,grp2"]) # test(120, tt[1] < 0.5) # actually takes more like 0.068 << 0.5, but the micro EC2 instance can be slow sometimes. # TO DO: incorporate performance testing into R CMD check (using testthat?), that somehow copes with running on slow machines. i = sample(nrow(ans),1) test(121, all.equal(ans[i,c(V1,V2)], dt[grp1==ans[i,grp1] & grp2==ans[i,grp2], c(mean(x),mean(y))])) # To DO: add a data.frame aggregate method here and check data.table is faster # Tests of 0 and 1 row tables TESTDT = data.table(NULL) test(122, TESTDT[1], TESTDT) test(123, TESTDT[0], TESTDT) test(124, TESTDT[1:10], TESTDT) test(125, TESTDT["k"], error="x must be keyed") # test 126 no longer needed now that test() has 'error' argument TESTDT = data.table(a=3L,v=2L,key="a") # testing 1-row table test(127, TESTDT[J(3)], TESTDT) test(128, TESTDT[J(4)], data.table(a=4L,v=NA_integer_,key="a")) # see tests 206-207 too re the [NA] test(129, TESTDT[J(4),roll=TRUE], data.table(a=4L,v=2L,key="a")) # the i values are in the result now (which make more sense for rolling joins, the x.a can still be accessed if need be) test(130, TESTDT[J(4),roll=TRUE,rollends=FALSE], data.table(a=4L,v=NA_integer_,key="a")) test(131, TESTDT[J(-4),roll=TRUE], data.table(a=-4L,v=NA_integer_,key="a")) test(132, ncol(TESTDT[0]), 2L) test(133, TESTDT[0][J(3)], data.table(a=3L,v=NA_integer_,key="a")) # These need to retain key for consistency (edge cases of larger sorted i) # tests on data table names, make.names is now FALSE by default from v1.8.0 x = 2L; `1x` = 4L dt = data.table(a.1 = 1L, b_1 = 2L, "1b" = 3L, `a 1` = 4L, x, `1x`, 2*x) test(134, names(dt), c("a.1", "b_1", "1b", "a 1", "x", "V6", "V7")) dt = data.table(a.1 = 1L, b_1 = 2L, "1b" = 3L, `a 1` = 4L, x, `1x`, 2*x, check.names=TRUE) test(134.5, names(dt), c("a.1", "b_1", "X1b", "a.1.1", "x", "V6", "V7")) dt = data.table(a.1 = 1L, b_1 = 2L, "1b" = 3L, `a 1` = 4L, x, `1x`, 2*x, check.names = FALSE) test(135, names(dt), c("a.1", "b_1", "1b", "a 1", "x", "V6", "V7")) # the last two terms differ from data.frame() test(136, dt[,b_1, by="a.1"], data.table(a.1=1L,"b_1"=2L)) test(137, dt[,`a 1`, by="a.1"], data.table(a.1=1L,"a 1"=4L, check.names=FALSE)) test(138, dt[,a.1, by="`a 1`"], data.table(`a 1`=4L,a.1=1L, check.names=FALSE)) # tests with NA's in factors dt = data.table(a = c(NA, letters[1:5]), b = 1:6) test(139, dt[,sum(b), by="a"], data.table(a = c(NA, letters[1:5]), V1 = 1:6)) # tests to make sure rbind and grouping keep classes dt = data.table(a = rep(as.Date("2010-01-01"), 4), b = rep("a",4)) test(140, rbind(dt,dt), data.table(a = rep(as.Date("2010-01-01"), 8), b = rep("a",8))) test(141, dt[,list(a=a), by="b"], dt[,2:1, with = FALSE]) dt$a <- structure(as.integer(dt$a), class = "Date") test(142, dt[,list(b=b), by="a"], dt) dt = data.table(x=1:5,y=6:10) test(143, tail(dt), dt) # tail was failing if a column name was called x. dt <- data.table(a = rep(1:3, each = 4), b = LETTERS[1:4], b2 = LETTERS[1:4]) test(144, dt[, .SD[3,], by=b], data.table(b=LETTERS[1:4],a=3L,b2=LETTERS[1:4])) DT = data.table(x=rep(c("a","b"),c(2,3)),y=1:5) xx = capture.output(ans <- DT[,{print(x);sum(y)},by=x,verbose=FALSE]) test(145, xx, c("[1] \"a\"","[1] \"b\"")) test(146, ans, data.table(x=c("a","b"),V1=c(3L,12L))) test(147, DT[,MySum=sum(v)], error="unused argument") # user meant DT[,list(MySum=sum(v))]. FR#204 done. dt = data.table(a=c(1L,4L,5L), b=1:3, key="a") test(148, dt[CJ(2:3),roll=TRUE], data.table(a=c(2L,3L),b=c(1L,1L),key="a")) test(149, dt[J(2:3),roll=TRUE], data.table(a=c(2L,3L),b=c(1L,1L),key="a")) # in future this will detect the subset is ordered and retain the key # 150:158 test out of order factor levels in key columns (now allowed from v1.8.0) dt = data.table(x=factor(c("c","b","a"),levels=c("b","a","c")),y=1:3) setkey(dt,x) test(150.1, dt["b",y,verbose=TRUE], output="Coercing character column i.'x' to factor") # changed i.V1 to i.x as per FR #2693 test(150.2, dt["b",y]$y, 2L) # from Tom's post : a = data.table(a=rep(1:5, 2), b=factor(letters[rep(1:5, each =2)], levels=letters[5:1]), key="b") test(151.1, a[J("b"),a,verbose=TRUE], output="Coercing character column i.'b' to factor") # changed `i.V1` to `i.b` following changes for FR #2693 test(151.2, a[J("b"),a]$a, 3:4) # stretch tests further, two out of order levels, one gets key'd the other not : a = data.table(x=factor(letters[rep(1:5, each =2)], levels=letters[5:1]), y=factor(letters[rep(c(6,9,7,10,8), each =2)], levels=letters[10:6]), z=1:10) test(152, is.sorted(levels(a$x)), FALSE) test(153, is.sorted(levels(a$y)), FALSE) test(154, a[,sum(z),by=x][1,paste(x,V1)], "a 3") # ad hoc by doesn't sort the groups so 'a' (5th level) should be first setkey(a,x) # 'e' (level 1) should come first now. test(155, is.sorted(levels(a$x)), FALSE) test(156, is.sorted(levels(a$y)), FALSE) test(157, a[,sum(z),by=x][1,paste(x,V1)], "e 19") # 1st level is now first test(158, a[,sum(z),by=y][1,paste(y,V1)], "h 19") # not 'f' test(158.5, a[,sum(z),keyby=y][1,paste(y,V1)], "j 15") # not 'f' either # tests of by expression variables DT = data.table( a=1:5, b=11:50, d=c("A","B","C","D"), f=1:5, grp=1:5 ) f = quote( list(d) ) test(159, DT[,mean(b),by=eval(f)], DT[,mean(b),by=list(d)]) # column f doesn't get in the way of expression f foo = function( grp ) { DT[,mean(b),by=eval(grp)] } test(160, foo(quote(list(d))), DT[,mean(b),by=list(d)]) test(161, foo(quote(list(d,a))), DT[,mean(b),by=list(d,a)]) test(162, foo(quote(list(f))), DT[,mean(b),by=list(f)]) test(163, foo(quote(list(grp))), DT[,mean(b),by=list(grp)]) # grp local variable in foo doesn't conflict with column grp test(164, foo(f), DT[,mean(b),by=d]) # checks that data.table inherits methods from data.frame in base ok test(165, subset(DT,a>2), DT[a>2]) test(166, suppressWarnings(split(DT,DT$grp)[[2]]), DT[grp==2]) if ("package:ggplot2" %in% search()) { test(167,names(print(ggplot(DT,aes(b,f))+geom_point())),c("data","panel","plot")) # The names() is a stronger test that it has actually plotted, but also because test() sees the invisible result test(167.1,DT[,print(ggplot(.SD,aes(b,f))+geom_point()),by=list(grp%%2L)],data.table(grp=integer())) # %%2 because there are 5 groups in DT data at this stage, just need 2 to test # New test reported by C Neff on 11 Oct 2011 if ("package:hexbin" %in% search()) test(167.2, names(print(ggplot(DT) + geom_hex(aes(b, f)) + facet_wrap(~grp))), c("data","panel","plot")) else cat("Test 167.2 not run. If required call library(hexbin) first.\n") # Test plotting ITime with ggplot2 which seems to require an as.data.frame method for ITime, #1713 datetimes = c("2011 NOV18 09:29:16", "2011 NOV18 10:42:40", "2011 NOV18 23:47:12", "2011 NOV19 01:06:01", "2011 NOV19 11:35:34", "2011 NOV19 11:51:09") DT = IDateTime(strptime(datetimes,"%Y %b%d %H:%M:%S")) test(168, print(DT[,qplot(idate,itime)])$ranges, print(qplot(DT$idate,DT$itime))$ranges) test(168.1, print(DT[,qplot(idate,as.POSIXct(itime,tzone=""))])$ranges, print(qplot(idate,as.POSIXct(itime,tzone=""),data=DT))$ranges) try(graphics.off(),silent=TRUE) #try(graphics.off(),silent=TRUE) # R CMD check doesn't like graphics it seems, even when inside try() } else { cat("Tests 167-168 not run. If required call library(ggplot2) first.\n") # ggplot takes a long time so we don't include these by default # From examples, the library(ggplot2) is done first, so that 'R CMD check' does include tests 167-168 } # test of . in formula, using inheritance DT = data.table(y=1:100,x=101:200,y=201:300,grp=1:5) test(169,DT[,as.list(lm(y~0+.,.SD)$coef),by=grp][2,x]-2<1e-10, TRUE) DT <- data.table( a=1:4, d=c("A","B","C","D") ) g <- quote( list( d ) ) test(170, DT[,list(d)], DT[,eval(g)]) DT = data.table(A=c(25L,85L,25L,25L,85L), B=c("a","a","b","c","c"), C=c(2,65,9,82,823)) test(171.1, DT[B=="b"][A==85], output="Empty data.table (0 rows) of 3 cols: A,B,C") test(171.2, DT[B=="b"][A==85,C], numeric()) test(171.3, DT[ , data.table( A, C )[ A==25, C ] + data.table( A, C )[ A==85, C ], by=B ], data.table(B=c("a","c"),V1=c(67,905))) test(172, DT[ , list(3,data.table( A, C )[ A==25, C ] + data.table( A, C )[ A==85, C ]), by=B ], data.table(B=c("a","b","c"),V1=3,V2=c(67,NA,905))) # Test growing result in memory. Usually the guess is good though. # This example returns no rows for first group so guess for up-front allocate needs a reallocate DT = data.table(A=c(1L,1L,2L,2L,3L,3L), B=1:6) test(173, DT[,B[B>3],by=A][,V1], c(4L,5L,6L)) # Example taken from Harish post to datatable-help on 11 July DT <- data.table( A=c("a","a","b","b","d","c","a","d"), B=c("x1","x2","x2","x1","x2","x1","x1","x2"), C=c(5,2,3,4,9,5,1,9) ) test(174, DT[,C[C-min(C)<3],by=list(A,B)][,V1], c(1,2,3,4,9,9,5)) test(175, DT[,C[C-min(C)<5],by=list(A,B)][,V1], c(5,1,2,3,4,9,9,5)) # Tests of data.table sub-assignments: $<-.data.table & [<-.data.table DT = data.table(a = c("A", "Z"), b = 1:10, key = "a") DT[J("A"),2] <- 100L # without L generates nice warning :-) DT[J("A"),"b"] <- 1:5 DT[1:3,"b"] <- 33L test(176, DT, data.table(a = rep(c("A", "Z"), each = 5), b = as.integer(c(rep(33, 3), 4:5, seq(2, 10, by = 2))), key = "a")) DT[J("A"),"a"] <- "Z" test(177, DT, data.table(a="Z", b=as.integer(c(rep(33, 3), 4:5, seq(2, 10, by = 2))))) # i.e. key dropped and column a still factor DT <- data.table(a = c("A", "Z"), b = 1:10, key = "a") DT$b[1:5] <- 1:5 DT$b[1:3] <- 33 test(178, DT, data.table(a = rep(c("A", "Z"), each = 5), b = c(rep(33, 3), 4:5, seq(2, 10, by = 2)), key = "a")) DT$a <- 10:1 test(179, key(DT), NULL ) # Test logical in a key DT = data.table(a=rep(1:3,each=2),b=c(TRUE,FALSE),v=1:6) setkey(DT,a,b) test(180, DT[J(2,FALSE),v]$v, 4L) test(181, DT[,sum(v),by=b][,V1], c(12L,9L)) # Test fix for bug 1026 reported by Harish V # this test needed a unique var name to generate error 'object 'b' not found'. # Otherwise it finds 'b' in local scope. setnames(DT,2,"buniquename314") bar = function( data, fcn ) { q = substitute( fcn ) xx = data[,eval(q),by=a] yy = data[,eval(substitute(fcn)),by=a] identical(xx,yy) } test(182, bar( DT, sum(buniquename314) ), TRUE) # Test bug 1005 reported by Branson Owen DT = data.table(A = c("o", "x"), B = 1:10, key = "A") test(183, DT[J(unique(A)), B]$B, DT$B) # Test bug 709 which returned an error here. And return type now empty table, #1945 in 1.8.1. xx = data.table(a=1:5,b=6:10) test(184, xx[a>6,sum(b),by=a], data.table(a=integer(),V1=integer())) # Tests of bug 1015 highlight by Harish # See thread "'by without by' now heeds nomatch=NA" # Tests 185-201 were added in above next to originals x <- data.table(a=c("a","b","d","e"),b=c("A","A","B","B"),d=c(1,2,3,4), key="a,b") y <- data.table(g=c("a","b","c","d"),h=c("A","A","A","A")) test(202, x[y], x[y,mult="all"]) test(203, x[y,d]$d, c(1,2,NA,NA)) test(204, x[y,list(d)], x[y,d]) test(205, x[y,list(d),mult="all"][,d], c(1,2,NA,NA)) # Test [NA] returns one NA row. NA is type *logical* so prior to # change in v1.5, NA would get silently recycled and the whole table would # be returned all NA (rarely useful and often confusing, but consistent # with data.frame). TESTDT = data.table(a=1:3,v=1:3,key="a") test(206, TESTDT[NA], data.table(a=NA_integer_,v=NA_integer_,key="a")) # NA are now allowed in keys, so retains key setkey(TESTDT,NULL) test(207, TESTDT[NA], data.table(a=NA_integer_,v=NA_integer_)) # With inheritance, NROW and NCOL in base work nicely. No need for them in data.table. test(208, NROW(TESTDT), 3L) test(209, nrow(TESTDT), 3L) test(210, NCOL(TESTDT), 2L) test(211, ncol(TESTDT), 2L) # Test infinite recursion error is trapped when a pre-1.5 data.table # is used with 1.5 (bug #1008) DT = data.table(a=1:6,key="a") test(212, DT[J(3)]$a, 3L) # correct class c("data.table","data.frame") class(DT) = "data.table" # incorrect class, but as from 1.8.1 it works. By accident when moving from colnames() to names(), it was dimnames() doing the check, but rather than add a check that identical(class(DT),c("data.frame","data.table")) at the top of [.data.table, we'll leave it flexible to user (user might not want to inherit from data.frame for some reason). test(213, DT[J(3)]$a, 3L) # setkey now auto coerces double and character for convenience, and # to solve bug #953 DF = data.frame(a=LETTERS[1:10], b=1:10, stringsAsFactors=FALSE) DT = data.table(DF) setkey(DT,a) # used to complain about character test(215, DT["C",b]$b, 3L) DT = data.table(DF,key="a") test(216, DT["C",b]$b, 3L) DT = data.table(a=c(1,2,3),v=1:3,key="a") test(217, DT[J(2),v]$v, 2L) DT = data.table(a=c(1,2.1,3),v=1:3,key="a") test(218, DT[J(2.1),v]$v, 2L) # tests of quote()-ed expressions in i. Bug #1058 DT = data.table(a=1:5,b=6:10,key="a") q = quote(a>3) test(220, DT[eval(q),b], 9:10) test(221, DT[eval(parse(text="a>4")),b], 10L) test(222, DT[eval(parse(text="J(2)")),b]$b, 7L) # lists in calling scope should be ok as single names passed to by, bug #1060 DT = data.table(a=1:2,b=rnorm(10)) byfact = DT[,a] # vector, ok before fix but check anyway test(223, DT[,mean(b),by=byfact], DT[,mean(b),by=list(byfact)]) byfact = DT[,list(a)] # this caused next line to fail before fix test(224, DT[,mean(b),by=byfact], DT[,mean(b),by=as.list(byfact)]) test(225, DT[,mean(b),by=byfact], DT[,mean(b),by={byfact}]) # tests for building expressions via parse, bug #1243 dt1key<-data.table(A1=1:100,onekey=rep(1:2,each=50)) setkey(dt1key,onekey) ASumExpr<-parse(text="quote(sum(A1))") # no need for quote but we test it anyway because that was work around when test 227 failed ASumExprNoQ<-parse(text="sum(A1)") ans = dt1key[,sum(A1),by=onekey] test(226,ans,dt1key[,eval(eval(ASumExpr)),by=onekey]) test(227,ans,dt1key[,eval(ASumExprNoQ),by=onekey]) # test for uncommon grouping pattern on 1-row data.table, bug #1245 DT = data.table(a=1L,b=2L) test(228,DT[,list(1:2),by=a],data.table(a=c(1L,1L),V1=1:2)) # special case j=.SD, bug #1247 DT = data.table(a=rep(1:2,each=2),b=1:4) test(229,DT[,.SD,by=a],DT) setkey(DT,a) test(229.1,DT[,.SD,by=key(DT)],DT) # merge bug with column 'x', bug #1229 d1 <- data.table(x=c(1,3,8),y1=rnorm(3), key="x") d2 <- data.table(x=c(3,8,10),y2=rnorm(3), key="x") ans1=merge(d1, d2, by="x") ans2=cbind(d1[2:3],y2=d2[1:2]$y2);setkey(ans2,x) test(230, ans1, ans2) # one column merge, bug #1241 DT = data.table(a=rep(1:2,each=3),b=1:6,key="a") y = data.table(a=c(0,1),bb=c(10,11),key="a") test(231,merge(y,DT),data.table(a=1L,bb=11,b=1:3,key="a")) test(232,merge(y,DT,all=TRUE),data.table(a=rep(c(0L,1L,2L),c(1,3,3)),bb=rep(c(10,11,NA_real_),c(1,3,3)),b=c(NA_integer_,1:6),key="a")) y = data.table(a=c(0,1),key="a") # y with only a key column test(233,merge(y,DT),data.table(a=1L,b=1:3,key="a")) test(234,merge(y,DT,all=TRUE),data.table(a=rep(c(0L,1L,2L),c(1,3,3)),b=c(NA_integer_,1:6),key="a")) # 'by' when DT contains list columns DT = data.table(a=c(1,1,2,3,3),key="a") DT$b=list(1:2,1:3,1:4,1:5,1:6) test(235,DT[,mean(unlist(b)),by=a],data.table(a=c(1,2,3),V1=c(1.8,2.5,mean(c(1:5,1:6))),key="a")) test(236,DT[,sapply(b,mean),by=a],data.table(a=c(1,1,2,3,3),V1=c(1.5,2.0,2.5,3.0,3.5),key="a")) # when i is a single name, it no longer evaluates within data.table scope DT = data.table(a=1:5,b=rnorm(5),key="a") a = list(4) test(237,DT[a],DT[J(4)]) # repeat earlier test with xkey instead of x. xkey is internal to merge; the bigger problem Tom mentioned. d1 <- data.table(xkey=c(1,3,8),y1=rnorm(3), key="xkey") d2 <- data.table(xkey=c(3,8,10),y2=rnorm(3), key="xkey") ans2=cbind(d1[2:3],y2=d2[1:2]$y2);setkey(ans2,xkey) test(238, merge(d1, d2, by="xkey"), ans2) # Join Inherited Scope, and X[Y] including Y's non-join columns X=data.table(a=rep(1:3,c(3,3,2)),foo=1:8,key="a") Y=data.table(a=2:3,bar=6:7) test(239, X[Y,sum(foo)], data.table(a=2:3,V1=c(15L,15L),key="a")) test(240, X[Y,sum(foo*bar)], data.table(a=2:3,V1=c(90L,105L),key="a")) test(241, X[Y], data.table(a=rep(2:3,3:2),foo=4:8,bar=rep(6:7,3:2),key="a")) test(242, X[Y,list(foo,bar)][,sum(foo*bar)], 195L) test(243, X[Y][,sum(foo*bar)], 195L) # not sure about these yet : # test(244, X[Y,sum(foo*bar),mult="first"], data.table(a=2:3,V1=c(24L,49L))) # test(245, X[Y,sum(foo*bar),mult="last"], data.table(a=2:3,V1=c(36L,56L))) # joining to less than all X's key colums (in examples but can't see formal test) X=data.table(a=rep(LETTERS[1:2],2:3),b=1:5,v=10:14,key="a,b") test(246, X["A"], {tt=X[1:2];setkeyv(tt,key(X));tt}) # key will be retained in future test(247, X["C"]$v, NA_integer_) test(248, nrow(X["C",nomatch=0]), 0L) x=data.table( a=c("a","b","c"), b=1:3, key="a" ) y=data.table( a=c("b","d","e"), d=c(8,9,10) ) test(249, x[y], data.table(a=c("b","d","e"),b=c(2L,NA,NA),d=c(8,9,10),key="a")) # keeps i join cols test(250, x[y,mult="first"], data.table(a=c("b","d","e"),b=c(2L,NA,NA),d=c(8,9,10),key="a")) # same x=data.table( a=c("a","b","b","c"), b=1:4, key="a" ) y=data.table(a=c("b","d","b"), d=c(8,9,10)) test(251, x[y, allow.cartesian=TRUE], data.table(a=c("b","b","d","b","b"),b=c(2:3,NA,2:3),d=c(8,8,9,10,10))) # auto coerce float to int in ad hoc by (just like setkey), FR#1051 DT = data.table(a=INT(1,1,1,2,2),v=1:5) test(252, DT[,sum(v),by=a], data.table(a=1:2,V1=c(6L,9L))) # check that by retains factor columns, since character is now default DT = data.table(a=factor(c("A","A","A","B","B")),v=1:5) test(253, DT[,sum(v),by=a], data.table(a=factor(c("A","B")),V1=c(6L,9L))) # fix for bug #1298 with by=key(DT) and divisibility error. DT=data.table(a=c(1,1,1,2,2),b=1:5,key="a") test(254, DT[,sum(b),by=key(DT)]$V1, c(6L,9L)) # for for bug #1294 (combining scanning i and by) # also see test 88.5 contributed by Johann Hibschman above. DT = data.table(a=1:12,b=1:2,c=1:4) test(255, DT[a>5,sum(c),by=b]$V1, c(12L, 7L)) # fix for bug #1301 (all.vars() doesn't appear to find fn in fns[[fn]] usage) DT = data.table(a=1:6,b=1:2,c=letters[1:2],d=1:6) fns = list(a=max,b=min) test(256, DT[,fns[[b[1]]](d),by=c]$V1, c(5L,2L)) test(257, DT[,fns[[c[1]]](d),by=c]$V1, c(5L,2L)) fns=c(max,min) DT = data.table(ID=1:10, SCORE_1=1:10, SCORE_2=11:20, SCORE_3=30:21, fn=c(rep(1, 5), rep(2, 5))) test(258, DT[,fns[[fn]](SCORE_1,SCORE_2,SCORE_3),by=ID]$V1, c(30:26,6:10)) test(259, DT[,as.list(fns[[fn]](SCORE_1,SCORE_2,SCORE_3)),by=ID]$V1, c(30:26,6:10)) test(260, DT[,list(fns[[fn]](SCORE_1,SCORE_2,SCORE_3)),by=ID]$V1, c(30:26,6:10)) # fix for bug #1340 - Duplicate column names in self-joins (but print ok) DT <- data.table(id=1:4, x1=c("a","a","b","c"), x2=c(1L,2L,3L,3L), key="x1") test(261, DT[DT, allow.cartesian=TRUE][id < id.1]$x2.1, 2L) # "<-" within j now assigns in the same environment for 1st group, as the rest # Thanks to Andeas Borg for highlighting on 11 May dt <- data.table(x=c(0,0,1,0,1,1), y=c(0,1,0,1,0,1), z=1:6) groupInd = 0 test(262, dt[,list(z,groupInd<-groupInd+1),by=list(x,y)]$V2, c(1,2,2,3,3,4)) test(263, groupInd, 0) test(264, dt[,list(z,groupInd<<-groupInd+1),by=list(x,y)]$V2, c(1,2,2,3,3,4)) test(265, groupInd, 4) # Tests for passing 'by' expressions that evaluate to character column # names in the edge case of 1 row; the character 'by' vector could # feasibly be intended to be grouping values. Bug 1404; thanks to Andreas Borg # for the detailed report, suggested fix and tests. DT = data.frame(x=1,y="a",stringsAsFactors=FALSE) DT = as.data.table(DT) test(266,class(DT$y),"character") # just to check we setup the test correctly test(267,DT[,sum(x),by=y]$V1,1) test(268,DT[,sum(x),by="y"]$V1,1) colvars="y" test(269,DT[,sum(x),by=colvars]$V1,1) setkey(DT,y) test(270,DT[,sum(x),by=key(DT)]$V1,1) DT = data.table(x=1,y=2) setkeyv(DT,names(DT)) test(271, DT[,length(x),by=key(DT)]$V1, 1L) DT = data.table(x=c(1,2,1), y=c(2,3,2), z=1:3) setkeyv(DT,names(DT)) test(272, DT[,sum(z),by=key(DT)]$V1, c(1L,3L,2L)) # Tests for .BY and implicit .BY # .BY is a single row, and by variables are now, too. FAQ 2.10 has been changed accordingly. DT = data.table(a=1:6,b=1:2) test(273, DT[,sum(a)*b,by=b]$V1, c(9L,24L)) test(274, DT[,sum(a)*.BY[[1]],by=b], data.table(b=1:2,V1=c(9L,24L))) test(275, DT[,sum(a)*bcalc,by=list(bcalc=b+1L)], data.table(bcalc=2:3,V1=c(18L,36L))) test(276, DT[,sapply(.SD,sum)*b,by=b], data.table(b=1:2,V1=c(9L,24L))) # .SD should no longer include b, unlike v1.6 and before test(277, DT[,sapply(.SD,sum)*bcalc,by=list(bcalc=b+1L)], data.table(bcalc=2:3,V1=c(18L,36L))) # cols used in by expressions are excluded from .SD, but can still be used in j (by name only and may vary within the group e.g. DT[,max(diff(date)),by=month(date)] test(278, DT[,sum(a*b),by=list(bcalc=b+1L)], data.table(bcalc=2:3,V1=c(9L,24L))) # Test x==y where either column contain NA. DT = data.table(x=c(1,2,NA,3,4),y=c(0,2,3,NA,4),z=1:5) test(279, DT[x==y,sum(z)], 7L) # In data.frame the equivalent is : # > DF = as.data.frame(DT) # > DF[DF$x==DF$y,] # x y z # 2 2 2 2 # NA NA NA NA # NA.1 NA NA NA # 5 4 4 5 # > DF[!is.na(DF$x) & !is.na(DF$y) & DF$x==DF$y,] # x y z # 2 2 2 2 # 5 4 4 5 # Test that 0 length columns are expanded with NA to match non-0 length columns, bug fix #1431 DT = data.table(pool = c(1L, 1L, 2L), bal = c(10, 20, 30)) test(280, DT[, list(bal[0], bal[1]), by=pool], data.table(pool=1:2, V1=NA_real_, V2=c(10,30))) test(281, DT[, list(bal[1], bal[0]), by=pool], data.table(pool=1:2, V1=c(10,30), V2=NA_real_)) # Test 2nd group too (the 1st is special) ... test(282, DT[, list(bal[ifelse(pool==1,1,0)], bal[1]), by=pool], data.table(pool=1:2, V1=c(10,NA), V2=c(10,30))) # More tests based on Andreas Borg's post of 11 May 2011. DT = data.table(x=INT(0,0,1,0,1,1), y=INT(1,1,0,1,1,1), z=1:6) ans = data.table(x=c(0L,1L,1L),y=c(1L,0L,1L),V1=c(1L,1L,2L),V2=c(7L,3L,11L)) test(283, DT[,list(sum(x[1], y[1]),sum(z)), by=list(x,y)], ans) test(284, DT[,list(sum(unlist(.BY)),sum(z)),by=list(x,y)], ans) groupCols = c("x", "y") test(285, DT[,list(sum(unlist(.BY)),sum(z)),by=groupCols], ans) groupExpr = quote(list(x,y)) test(286, DT[,list(sum(unlist(.BY)),sum(z)),by=groupExpr], ans) # Bug fix from Damian B on 25 June 2011 : DT = data.table(X=c(NA,1,2,3), Y=c(NA,2,1,3)) setkeyv(DT,c("X","Y")) test(287, unique(DT), DT) # Bug fix #1421: using vars in calling scope in j when i is logical or integer. DT = data.table(A=c("a","b","b"),B=c(4,5,NA)) myvar = 6 test(288, DT[A=="b",B*myvar], c(30,NA)) # Test new feature in 1.6.1 that i can be plain list (such as .BY) DT = data.table(grp=c("a","a","a","a","b","b","b"),v=1:7) mysinglelookup = data.table(grp=c("a","b"),s=c(42,84),grpname=c("California","New York"),key="grp") setkey(mysinglelookup,grp) test(289, DT[,sum(v*mysinglelookup[.BY]$s),by=grp], data.table(grp=c("a","b"),V1=c(420,1512))) # In v1.6.2 we will change so that single name j returns a vector, regardless of grouping test(290, DT[,list(mysinglelookup[.BY]$grpname,sum(v)),by=grp], data.table(grp=c("a","b"),V1=c("California","New York"),V2=c(10L,18L))) # Test user defined attributes are retained, see comment in FR#1006 DT = data.table(a=as.numeric(1:2),b=3:4) setattr(DT,"myuserattr",42) setkey(DT,a) # a is numeric so a change of type to integer occurs, too, via := which checks selfref is ok test(291, attr(DT,"myuserattr"), 42) # Test new .N symbol DT = data.table(a=INT(1,1,1,1,2,2,2),b=INT(3,3,3,4,4,4,4)) test(292, DT[,.N,by=list(a,b)], data.table(a=c(1L,1L,2L),b=c(3L,4L,4L),N=c(3L,1L,3L))) test(293, DT[,list(a+b,.N),by=list(a,b)], data.table(a=c(1L,1L,2L),b=c(3L,4L,4L),V1=4:6,N=c(3L,1L,3L))) # Test that setkey and := syntax really are by reference, even within functions. You # really do need to take a copy first to a new name; force(x) isn't enough. DT = data.table(a=1:3,b=4:6) f = function(x){ force(x) setkey(x) } f(DT) test(294,key(DT),c("a","b")) # The setkey didn't copy to a local variable. Need to copy first to local variable (with a new name) if required. f = function(x){ force(x) x[,a:=42L] } f(DT) test(295,DT,data.table(a=42L,b=4:6)) # := was by reference (fast) and dropped the key, too, because assigned to key column DT = data.table(a=1:3,b=4:6) f = function(x){ x = copy(x) setkey(x) } f(DT) test(295.1,key(DT),NULL) setkey(DT,a) f = function(x){ x = copy(x) x[,b:=10:12][J(2),b][[2]] } # test copy retains key test(295.2,f(DT),11L) test(295.3,DT,data.table(a=1:3,b=4:6,key="a")) # The := was on the local copy # new feature added 1.6.3, that key can be vector. test(296,data.table(a=1:3,b=4:6,key="a,b"),data.table(a=1:3,b=4:6,key=c("a","b"))) # test .SDcols (not speed, just operation) DT = data.table(grp=1:3,A1=1:9,A2=10:18,A3=19:27,B1=101:109,B2=110:118,B3=119:127,key="grp") test(297,DT[,list(A1=sum(A1),A2=sum(A2),A3=sum(A3)),by=grp], DT[,lapply(.SD,sum),by=grp,.SDcols=2:4]) DT = data.table(a=1:3,b=4:6) test(298, {DT$b<-NULL;DT}, data.table(a=1:3)) # delete column test(299, DT$c <- as.character(DT$c), error="zero length") # to simulate RHS which could (due to user error) be non NULL but zero length. This copies DT too, so the next test checks that a subsequent := detects and fixes that. test(299.1, DT[,c:=42L], data.table(a=1:3,c=42L), warning="Invalid .internal.selfref detected and fixed") test(299.2, truelength(DT)>length(DT)) # the := over-allocated, by 100 by default, but user may have changed default so just check '>' # FR #2551 - old 299.3 and 299.5 are changed to include length(RHS) > 1 to issue the warning test(299.3, DT[2:3,c:=c(42, 42)], data.table(a=1:3,c=42L), warning="Coerced 'double' RHS to 'integer' to match the column's type.*length 3 (nrows of entire table)") # FR #2551 - length(RHS) = 1 - no warning for type conversion test(299.7, DT[2,c:=42], data.table(a=1:3,c=42L)) # also see tests 302 and 303. (Ok, new test file for fast assign would be tidier). test(299.4, DT[,c:=rep(FALSE,nrow(DT))], data.table(a=1:3,c=FALSE)) # replace c column with logical test(299.5, DT[2:3,c:=c(42,0)], data.table(a=1:3,c=c(FALSE,TRUE,FALSE)), warning="Coerced 'double' RHS to 'logical' to match the column's type.*length 3 (nrows of entire table)") # FR #2551 - length(RHS) = 1 - no warning for type conversion test(299.8, DT[2,c:=42], data.table(a=1:3,c=c(FALSE,TRUE,FALSE))) test(299.9, DT[2,c:=42L], data.table(a=1:3,c=c(FALSE,TRUE,FALSE))) test(299.6, DT[2:3,c:=c(0L, 0L)], data.table(a=1:3,c=FALSE), warning="Coerced 'integer' RHS to 'logical' to match the column's type.*length 3 (nrows of entire table)") # Test bug fix #1468, combining i and by. DT = data.table(a=1:3,b=1:9,v=1:9,key="a,b") test(300, DT[J(1),sum(v),by=b], data.table(b=c(1L,4L,7L),V1=c(1L,4L,7L),key="b")) test(300.1, DT[J(1:2),sum(v),by=b], data.table(b=c(1L,4L,7L,2L,5L,8L),V1=c(1L,4L,7L,2L,5L,8L))) # Test ad hoc by of more than 100,000 levels, see 2nd part of bug #1387 (100,000 from the limit of base::sort.list radix) # This does need to be this large, like this in CRAN checks, because sort.list(method="radix") has this limit, which # this tests. But it's well under 10 seconds. DT = data.table(A=1:10,B=rnorm(10),C=factor(paste("a",1:100010,sep=""))) test(301, nrow(DT[,sum(B),by=C])==100010) DT = data.table(A=1:10,B=rnorm(10),C=paste("a",1:100010,sep="")) test(301.1, nrow(DT[,sum(B),by=C])==100010) # Test fast assign DT = data.table(a=c(1L,2L,2L,3L),b=4:7,key="a") DT[2,b:=42L] # needs to be on its own line to test DT symbol is changed by reference test(302, DT, data.table(a=c(1L,2L,2L,3L),b=c(4L,42L,6L,7L),key="a")) DT[J(2),b:=84L] test(303, DT, data.table(a=c(1L,2L,2L,3L),b=c(4L,84L,84L,7L),key="a")) # Test key is dropped when non-dt-aware packages reorder rows of data.table (for example) if ("package:plyr" %in% search()) { DT = data.table(a=1:10,b=1:2,key="a") test(304, arrange(DT,b), data.table(a=INT(1,3,5,7,9,2,4,6,8,10),b=INT(1,1,1,1,1,2,2,2,2,2))) # testing no key here, too } else { cat("Test 304 not run. If required call library(plyr) first.\n") } # Test that changing colnames keep key in sync. # TO DO: will have to do this for secondary keys, too, when implemented. DT = data.table(x=1:10,y=1:10,key="x") setnames(DT,c("a","b")) test(305, key(DT), "a") setnames(DT,"a","R") test(306, key(DT), "R") setnames(DT,"b","S") test(307, key(DT), "R") setnames(DT,c("a","b")) test(308, key(DT), "a") setnames(DT,1,"R") test(309, key(DT), "R") # Test :=NULL DT = data.table(x=1:5,y=6:10,z=11:15,key="y") test(310, DT[,x:=NULL], data.table(y=6:10,z=11:15,key="y")) # delete first test(311, DT[,y:=NULL], data.table(z=11:15)) # deleting key column also removes key test(312, DT[,z:=NULL], data.table(NULL)) # deleting all test(313, DT[,a:=1:3], error="") # cannot := a new column to NULL data.table, currently. Must use data.table() DT = data.table(a=20:22) test(314, {DT[,b:=23:25];DT[,c:=26:28]}, data.table(a=20:22,b=23:25,c=26:28)) # add in series test(315, DT[,c:=NULL], data.table(a=20:22,b=23:25)) # delete last test(316, DT[,c:=NULL], data.table(a=20:22,b=23:25), warning="Adding new column 'c' then assigning NULL") # Test adding, removing and updating columns via [<- in one step DT = data.table(a=1:6,b=1:6,c=1:6) DT[,c("a","c","d","e")] <- list(NULL,11:16,42L,21:26) test(317, DT, data.table(b=1:6,c=11:16,d=42L,e=21:26)) # Other assignments (covers DT[x==2, y:=5] too, #1502) DT[e<24,"b"] <- 99L test(318, DT, data.table(b=c(99L,99L,99L,4L,5L,6L),c=11:16,d=42L,e=21:26)) test(319, DT[b!=99L,b:=99L], data.table(b=99L,c=11:16,d=42L,e=21:26)) # previous within functionality restored, #1498 DT = data.table(a=1:10) test(320, within(DT, {b <- 1:10; c <- a + b})[,list(a,b,c)], data.table(a=1:10,b=1:10,c=as.integer(seq(2,20,length=10)))) # not sure why within makes columns in order a,c,b, but it seems to be a data.frame thing, too. test(321, transform(DT,b=42L,e=a), data.table(a=1:10,b=42L,e=1:10)) DT = data.table(a=1:5, b=1:5) test(322, within(DT, rm(b)), data.table(a=1:5)) # check that cbind dispatches on first argument as expected test(323, cbind(DT,DT), data.table(a=1:5,b=1:5,a=1:5,b=1:5)) # no check.names as from v1.8.0 (now we have :=, cbind is used far less anyway) test(324, cbind(DT,data.frame(c=1:5)), data.table(a=1:5,b=1:5,c=1:5)) test(325, rbind(DT,DT), data.table(a=c(1:5,1:5),b=1:5)) test(326, rbind(DT,data.frame(a=6:10,b=6:10)), data.table(a=1:10,b=1:10)) # test removing multiple columns, and non-existing ones, #1510 DT = data.table(a=1:5, b=6:10, c=11:15) test(327, within(DT,rm(a,b)), data.table(c=11:15)) test(328, within(DT,rm(b,c)), data.table(a=1:5)) test(329, within(DT,rm(b,a)), data.table(c=11:15)) test(330, within(DT,rm(b,c,d)), data.table(a=1:5), warning="object 'd' not found") DT[,c("b","a")]=NULL test(332, DT, data.table(c=11:15)) test(333, within(DT,rm(c)), data.table(NULL)) DT = data.table(a=1:5, b=6:10, c=11:15) DT[,2:1]=NULL test(334, DT, data.table(c=11:15)) test(335, DT[,2:1]<-NULL, error="Attempt to assign to column") DT = data.table(a=1:2, b=1:6) test(336, DT[,z:=a/b], data.table(a=1:2,b=1:6,z=(1:2)/(1:6))) test(337, DT[3:4,z:=a*b], data.table(a=1:2,b=1:6,z=c(1,1,3,8,1/5,2/6))) # test LHS of := when with=FALSE DT = data.table(a=1:3, b=4:6) test(338, DT[,2:=42L,with=FALSE], data.table(a=1:3,b=42L)) test(339, DT[,2:1:=list(10:12,3L),with=FALSE], data.table(a=3L,b=10:12)) test(340, DT[,"a":=7:9,with=FALSE], data.table(a=7:9,b=10:12)) test(341, DT[,c("a","b"):=1:3,with=FALSE], data.table(a=1:3,b=1:3)) mycols = "a" test(342, DT[,mycols:=NULL,with=FALSE], data.table(b=1:3)) mynewcol = "newname" test(343, DT[,mynewcol:=21L,with=FALSE], data.table(b=1:3,newname=21L)) mycols = 1:2 test(344, DT[,mycols:=NULL,with=FALSE], data.table(NULL)) # It seems that the .Internal rbind of two data.frame coerces IDate to numeric. Tried defining # "[<-.IDate" as per Tom's suggestion, and c.IDate to no avail (maybe because the .Internal code # in bind.c doesn't look up package methods?). Anyway, as from 1.8.1, double are allowed in keys, so # these still work but for a different reason than before 1.8.1: the results are IDate stored as double, # rather than before when is worked because by and setkey coerced double to integer. DF = data.frame(x=as.IDate(c("2010-01-01","2010-01-02")), y=1:6) DT = as.data.table(rbind(DF,DF)) test(345, DT[,sum(y),by=x], {.x=as.IDate(c("2010-01-01","2010-01-02"));mode(.x)="double";data.table(x=.x,V1=c(18L,24L))}) test(346, setkey(DT,x)[J(as.IDate("2010-01-02"))], {.x=as.IDate(rep("2010-01-02",6L));mode(.x)="double";data.table(x=.x,y=rep(c(2L,4L,6L),2),key="x")}) # Test that invalid keys are reset, without user needing to remove key using key(DT)=NULL first DT = data.table(a=letters[1:3],b=letters[6:4],key="a") attr(DT,"sorted")="b" # user can go under the hood test(347, setkey(DT,b), data.table(a=letters[3:1],b=letters[4:6],key="b"), warning="Already keyed by this key but had invalid row order, key rebuilt") # Test .N==0 with nomatch=NA|0 DT = data.table(a=1:2,b=1:6,key="a") test(349, DT[J(2:3),.N,nomatch=NA]$N, c(3L,0L)) test(350, DT[J(2:3),.N,nomatch=0]$N, c(3L)) # Test first .N==0 with nomatch=NA|0 test(350.1, DT[J(4),.N]$N, 0L) test(350.2, DT[J(0:4),.N]$N, c(0L,3L,3L,0L,0L)) # Test recycling list() on RHS of := DT = data.table(a=1:3,b=4:6,c=7:9,d=10:12) test(351, DT[,c("a","b"):=list(13:15),with=FALSE], data.table(a=13:15,b=13:15,c=7:9,d=10:12)) test(352, DT[,letters[1:4]:=list(1L,NULL),with=FALSE], data.table(a=c(1L,1L,1L),c=c(1L,1L,1L))) # Test assigning new levels into factor columns DT = data.table(f=factor(c("a","b")),x=1:4) test(353, DT[2,f:="c"], data.table(f=factor(c("a","c","a","b")),x=1:4)) test(354, DT[3,f:=factor("foo")], data.table(f=factor(c("a","c","foo","b")),x=1:4)) # Test growVector logic when adding levels (don't need to grow levels for character cols) newlevels = as.character(as.hexmode(1:2000)) DT = data.table(f=factor("000"),x=1:2010) test(355, DT[11:2010,f:=newlevels], data.table(f=factor(c(rep("000",10),newlevels)),x=1:2010)) DT = data.table(f=c("a","b"),x=1:4) # Test coercing factor to character column test(355.5, DT[3,f:=factor("foo")], data.table(f=c("a","b","foo","b"),x=1:4)) test(355.6, DT[4,f:=factor("bar"),verbose=TRUE], data.table(f=c("a","b","foo","bar"),x=1:4), output="Coerced factor to character to match the column") # See datatable-help post and NEWS item for 1.6.7 DT = data.table(X=factor(letters[1:10]), Y=1:10) DT$X = "Something Different" test(356, DT, data.table(X=factor("Something Different",levels=c(letters[1:10],"Something Different")), Y=1:10)) DT = data.table(X=letters[1:10], Y=1:10) DT$X = "Something Different" test(356.5, DT, data.table(X="Something Different", Y=1:10)) # Bug fix 1570 DT = data.table(x=1:5,y=1:5) test(357, DT[x==0, y:=5L], data.table(x=1:5,y=1:5)) test(358, DT[FALSE, y:=5L], data.table(x=1:5,y=1:5)) # Bug fix 1599 DT = data.table(a=1:2,b=1:6) test(359, DT[,sum(b),by=NULL], data.table(V1=21L)) test(360, DT[,sum(b),by=character(0)], data.table(V1=21L)) # Bug fix 1576 : NULL j results in 'inconsistent types' error DT = data.table(a=1:3,b=1:9) ans = data.table(a=c(1L,3L),V1=c(12L,18L)) test(361, DT[,if (a==2) NULL else sum(b),by=a], ans) test(362, DT[,if (a==2) data.table(NULL) else sum(b),by=a], ans) test(363, DT[,if (a==2) as.list(NULL) else sum(b),by=a], ans) test(364, DT[,if (a==2) integer(0) else sum(b),by=a], ans) # Test that data.table() can create list() columns directly # NB: test 235 above ('by' when DT contains list columns) created the list column in two steps, no longer necessary DT = data.table(a=1:2,b=list("h",7:8)) test(365, DT[1,b], list("h")) # should it be a special case for 1-item results to unlist? Don't think so: in keeping with no drop=TRUE principle test(366, DT[2,b], list(7:8)) DT = data.table(a=1:4,b=list("h",7:8),c=list(matrix(1:12,3),data.table(a=letters[1:3],b=list(1:2,3.4,"k"),key="a"))) test(367, DT[3,b], list("h")) test(368, DT[4,b], list(7:8)) test(369, DT[3,c[[1]][2,3]], 8L) test(370, DT[4,c[[1]]["b",b]$b[[1]]], 3.4) # Test returning a list() column via grouping DT = data.table(x=INT(1,1,2,2,2),y=1:5) test(371, DT[,list(list(unique(y))),by=x], data.table(x=1:2,V1=list(1:2,3:5))) # Test matrix i is an error test(372, DT[matrix(1:2,ncol=2)], error="i is invalid type (matrix)") # Tests from bug fix #1593 DT = data.table(x=letters[1:2], y=1:4) DT[x == "a", ]$y <- 0L test(373, DT, data.table(x=letters[1:2], y=c(0L,2L,0L,4L))) DT = data.table(x=letters[1:2], y=1:4, key="x") DT["a", ]$y <- 0L test(374, DT, data.table(x=letters[1:2], y=c(0L,2L,0L,4L), key="x")) DT = data.table(x=letters[1:2], y=1:4) DT[c(1,3), ]$y <- 0L test(375, DT, data.table(x=letters[1:2], y=c(0L,2L,0L,4L))) # Test unique on unsorted tables (and tolerance on numeric columns, too) DT = data.table(a=c(2,1,2),b=c(1,2,1)) test(376, unique(DT), data.table(a=c(2,1),b=c(1,2))) # From the SO thread : M = matrix(sample(2, 120, replace = TRUE), ncol = 3) DF = as.data.frame(M) DT = as.data.table(M) test(377, as.data.table(unique(DF)), unique(DT)) # Test compatibility with sqldf. sqldf() does a do.call("rbind" with empty input, # so this tests ..1 when NULL (which was insufficiently list(...)[[1]] in 1.6.6). # We now test this directly rather than using sqldf, because we couldn't get 'R CMD check' # past "(converted from warning) closing unused connection 3 (/tmp/RtmpYllyW2/file55822c52)" test(378, cbind(), NULL) test(379, rbind(), NULL) DT = data.table(a=rep(1:3,1:3),b=1:6) test(380, DT[,{.SD$b[1]=10L;.SD}, by=a], error="locked binding") # .SD locked for 1st group test(381, DT[,{if (a==2) {.SD$b[1]=10L;.SD} else .SD}, by=a], error="locked binding") # .SD locked in 2nd group onwards too # test that direct := is trapped, but := within a copy of .SD is allowed (FAQ 4.5). See also tests 556-557. test(382, DT[,b:=.N*2L,by=a], data.table(a=rep(1:3,1:3),b=rep(2L*(1:3),1:3))) test(383, DT[,{z=10L;b:=z},by=a], error=":= and `:=`(...) are defined for use in j, once only and in particular ways") test(384, DT[,{mySD=copy(.SD);mySD[1,b:=99L];mySD},by=a], data.table(a=rep(1:3,1:3),b=c(99L,99L,4L,99L,6L,6L))) # somehow missed testing := on logical subset with mixed TRUE/FALSE, reported by Muhammad Waliji DT = data.table(x=1:2, y=1:6) test(385, DT[x==1, y := x], data.table(x=1:2,y=c(1L,2L,1L,4L,1L,6L))) test(386, DT[c(FALSE,TRUE),y:=99L], data.table(x=1:2,y=c(1L,99L,1L,99L,1L,99L))) # test that column names have the appearance of being local in j (can assign to them ok), bug #1624 DT = data.table(name=c(rep('a', 3), rep('b', 2), rep('c', 5)), flag=FALSE) test(387, DT[,{flag[1]<-TRUE;list(flag=flag)}, by=name], DT[c(1,4,6),flag:=TRUE]) DT = data.table(score=1:10, name=c(rep('a', 4), rep('b',2), rep('c', 3), 'd')) test(388, DT[,{ans = score[1] score[1] <- -score[1] ans },by=name], data.table(name=letters[1:4],V1=c(1L,5L,7L,10L))) # Tests 389-394 (character grouping and sorting) now at the start of this file, so that any # errors elsewhere show up in the last 13 lines displayed by CRAN checks. # Test unique.data.table for numeric columns within tolerance, for consistency with # with unique.data.frame which does this using paste. DT = data.table(a=tan(pi*(1/4 + 1:10)),b=42L) # tan(...) from example in ?all.equal. test(395, all.equal(DT$a, rep(1,10))) test(396, length(unique(DT$a))>1) # 10 unique values on all CRAN machines (as of Nov 2011) other than mac (5 unique) test(397, unique(DT), DT[1]) # before v1.7.2 unique would return all 10 rows. For stability within tolerance, data.table has it's own modified numeric sort. test(398, duplicated(DT), c(FALSE,rep(TRUE,9))) DT = data.table(a=c(3.142, 4.2, 4.2, 3.142, 1.223, 1.223), b=rep(1,6)) test(399, unique(DT), DT[c(1,2,5)]) test(400, duplicated(DT), c(FALSE,FALSE,TRUE,TRUE,FALSE,TRUE)) DT[c(2,4,5),a:=NA] test(401, unique(DT), DT[c(1,2,3,6)]) test(402, duplicated(DT), c(FALSE,FALSE,FALSE,TRUE,TRUE,FALSE)) # Test NULL columns next to non-NULL, #1633 DT = data.table(a=1:3,b=4:6) test(403, DT[,list(3,if(a==2)NULL else b),by=a], data.table(a=1:3,V1=3,V2=c(4L,NA_integer_,6L))) test(404, DT[,list(3,if(a==1)NULL else b),by=a], error="Please use a typed empty vector instead.*such as integer.*or numeric") test(405, DT[,list(3,if(a==1)numeric() else b),by=a], error="Column 2 of result for group.*integer.*double.*types must be consistent for each group") test(406, DT[,list(3,if(a==1)integer() else b),by=a], data.table(a=1:3,V1=3,V2=c(NA_integer_,5:6))) # Test that first column can be list, #1640 test(407, data.table(list(1:2,3:5)), as.data.table(list(list(1:2,3:5)))) # With over-allocation, null data.table has truelength 100. Replaced the calls to structure() in the # code to new null.data.table(), so test internal function. User may have changed default, so this # doesn't test "100" explicitly. test(408, null.data.table(), data.table(NULL)) test(408.5, data.table(), data.table(NULL)) # Test that adding a column using := is fully by reference rather than a shallow copy, #1646 DT = data.table(1:2,3:4) # list vector truelength 100 DT2 = DT DT2[,y:=10L] test(409, DT, DT2) test(410, DT, data.table(1:2,3:4,y=10L)) DT2[1,V1:=99L] test(411, DT, DT2) test(412, DT, data.table(c(99L,2L),3:4,y=10L)) # Test that cbind dispatched to data.table() and retains keys DT = data.table(x=c("a","b"),y=1:4,key="x") test(413.1, key(cbind(DT,DT)), NULL) # key dropped because name "x" ambiguous DT1 = data.table(z = c(1,2), w = 1:4, key = "z") test(413.2, key(cbind(DT,DT1)), c("x", "z")) test(413.3, key(cbind(colA=10:13, DT)), "x") # data.table() dispatched even though 1st argument isn't data.table test(413.4, key(cbind(colA=10:17, DT)), NULL) # DT recycled so key is dropped test(413.5, key(cbind(colA=1, DT)), "x") # DT not recycled so key retained test(414.1, key(cbind(DT,as.data.frame(DT1))), "x") test(414.2, cbind(as.data.frame(DT),DT1), data.frame(DT,DT1)) # cbind(DF,...) should return a data.frame for consistency with base. Package treemap (at least) depends # on this in the return() in treepalette(). # Use data.table(DF,DT) if a data.table result is required. # Test friendly error when := is used in wrong place test(415, x:=1, error="defined for use in j, once only and in particular ways") # Somehow never tested that X[Y] is error if X is unkeyed. DT = data.table(a=1:3,b=4:6) test(416, DT[J(2)], error="x must be keyed") # Test shallow copy warning from := adding a column, and (TO DO) only when X is NAMED. DT = data.table(a=1:3,b=4:6) test(417, alloc.col(DT,3), DT, warning="Attempt to reduce allocation from.*to 3 ignored. Can only increase allocation via shallow copy") old = getOption("datatable.alloccol") # search for "r-devel" note in this file why not in one step here options(datatable.alloccol=3L) DT = data.table(a=1:3,b=4:6) options(datatable.alloccol=old) DT2 = DT test(418, length(DT)==2 && truelength(DT)==3) DT[,c:=7L] # uses final slot test(419, DT, DT2) test(420, length(DT)==3 && truelength(DT)==3 && length(DT2)==3 && truelength(DT2)==3) test(421, DT[,d:=8L,verbose=TRUE], output="Growing vector of column pointers from") test(422, length(DT)==4) test(423, truelength(DT)>=4) # with default alloccol, new tl will be 103. But user might have set that higher and then be running test.data.table(), or user might have set alloccol to just ncol(DT)+1. Hence just >=4. # Test crash bug fixed, #1656, introduced with the 1.7.0 feature DT <- data.table(a = factor(c("A", "Z")), b = 1:4) DT[1,1] <- "Z" test(424, DT, data.table(a=factor(c("Z","Z","A","Z")),b=1:4)) test(425, DT[1,1] <- 1, 1, warning="Coerced 'double' RHS to 'integer'") test(426, DT, data.table(a=factor(c("A","Z")),b=1:4)) DT[1,1] <- 2L test(427, DT, data.table(a=factor(c("Z","Z","A","Z")),b=1:4)) DT[1,a:="A"] test(428, DT, data.table(a=factor(c("A","Z","A","Z")),b=1:4)) DT[1,a:=2L] test(429, DT, data.table(a=factor(c("Z","Z","A","Z")),b=1:4)) test(430, DT[1,1]<- 3L, NA_integer_, warning="RHS contains 3 which is outside the levels range.*1,2.*of column 1, NAs generated") test(431, DT[1,1:=4L,with=FALSE], data.table(a=factor(c(NA,"Z","A","Z")),b=1:4), warning="RHS contains 4 which is outside the levels range.*1,2.*of column 1, NAs generated") # simple realloc test if (is.null(getOption("datatable.alloccol"))) { DT = data.table(a=1:3,b=4:6) test(432, truelength(DT), 100L) alloc.col(DT,200) test(433, truelength(DT), 200L) DT = alloc.col(DT,300) # superfluous in this example, but shouldn't fail test(434, truelength(DT), 300L) DT2 = alloc.col(DT,400) test(435, truelength(DT), 400L) test(436, truelength(DT2), 400L) } # test that alloc.col assigns to wherever object is DT = data.table(a=1:3,b=4:6) f = function() { alloc.col(DT,200) # DT isn't local so (via inherits=TRUE) it finds in frame above invisible() } f() test(437, truelength(DT), 200L) # quick test that [<- over allocates (again) after the copy of length via *tmp* DT = data.table(a=1:3,b=4:6) tl = truelength(DT) DT$foo = 7L test(438, truelength(DT), tl) DT[,"bar"] = 8L test(439, truelength(DT), tl) test(440, DT, data.table(a=1:3,b=4:6,foo=7L,bar=8L)) # Test rbind works by colname now, for consistency with base, FR#1634 DT = data.table(a=1:3,b=4:6) test(441, rbind(DT,list(a=4L,b=7L)), data.table(a=1:4,b=4:7)) test(442, rbind(DT,data.frame(a=4L,b=7L)), data.table(a=1:4,b=4:7)) test(443, rbind(DT,data.table(a=4L,b=7L)), data.table(a=1:4,b=4:7)) test(444, rbind(DT,list(b=7L,a=4L)), data.table(a=1:4,b=4:7), warning="Argument 2 has names in a different order.") test(445, rbind(DT,data.frame(b=7L,a=4L)), data.table(a=1:4,b=4:7), warning="Argument 2 has names in a different order.") test(446, rbind(DT,data.table(b=7L,a=4L)), data.table(a=1:4,b=4:7), warning="Argument 2 has names in a different order.") test(450, rbind(DT,list(c=4L,a=7L)), error="Some colnames of argument 2 (c) are not present in colnames of item 1") test(451, rbind(DT,data.frame(c=4L,a=7L)), error="Some colnames of argument 2 (c) are not present in colnames of item 1") test(452, rbind(DT,data.table(c=4L,a=7L)), error="Some colnames of argument 2 (c) are not present in colnames of item 1") test(453, rbind(DT,list(4L,7L)), data.table(a=1:4,b=4:7)) # Test new use.names argument in 1.8.0 test(453.1, rbind(DT,list(FOO=4L,BAR=7L),use.names=FALSE), data.table(a=1:4,b=4:7)) test(453.2, rbind(DT,data.table(b=4:5,a=7:8), use.names=FALSE), data.table(a=1:5,b=4:8)) # Test the linked reported bug, #1645 A1 = data.table(b='hello', a='foo', key='a') A2 = data.table(a=c('foo', 'bar'), key='a') test(454, merge(A1, A2, all.y=TRUE, by='a'), data.table(a=c("bar","foo"),b=c(NA,"hello"),key="a")) A1 = data.table(a='foo', b='hello', key='a') test(455, merge(A1, A2, all.y=TRUE, by='a'), data.table(a=c("bar","foo"),b=c(NA,"hello"),key="a")) # Test mixing nomatch=0 and mult="last", bug #1661 DT = data.table(id=c(1L, 2L, 2L, 3L), val=1:4, key="id") test(456, DT[J(c(1,2,4)), mult="last", nomatch=0], data.table(id=1:2,val=c(1L,3L),key="id")) # Test join inherited scope respexts nomatch=0, #1663 DT2 = data.table(id=c(1L,2L,4L), val2=c(11,12,14),key="id") test(457, DT[DT2, list(val, val2), nomatch=0], data.table(id=c(1L,2L,2L),val=1:3,val2=c(11,12,12),key="id")) # Test bysameorder edge cases, #1631 DT = data.table(a=1:3,v=4:9,key="a") test(458, DT[,sum(v),by=list(a%%2L)], data.table(a=c(1L,0L),V1=c(26L,13L))) test(459, DT[, list(sum(v)), list(ifelse(a == 2, NA, 1L))], data.table(ifelse=c(1L,NA_integer_),V1=c(26L,13L))) test(460, DT[, list(sum(v)), list(ifelse(a == 2, 1, NA))], data.table(ifelse=c(NA_real_,1),V1=c(26L,13L))) test(461, DT[,sum(v),by=a], data.table(a=1:3,V1=c(11L,13L,15L),key="a")) # Test loading from file (which resets tl to 0 in R 2.14.0+, and unitialized random number in 2.13.2-) f = tempfile() save(list="DT",file=f) load(f) test(462, DT[,foo:=10L], data.table(a=1:3,v=4:9,foo=10L,key="a")) unlink(f) # Test CJ problems with v1.7.4, #1689 test(463, all(sapply(CJ(1:2,1:3),length)==6L)) DT = data.table(x=1:4,y=1:2,cnt=1L,key="x,y") test(464, DT[CJ(1:4,1:4)]$cnt, INT(1,rep(NA,4),1,NA,NA,1,rep(NA,4),1,NA,NA)) test(465, DT[CJ(1:4,1:4), sum(cnt>0)]$y, rep(1:4,4)) f1 = factor(c("READING","MATHEMATICS")) f2 = factor(c("2010_2011","2009_2010","2008_2009"), levels=paste(2006:2010,2007:2011,sep="_")) test(466, all(sapply(CJ(f1, f2),length)==6L)) # Test list(.SD,newcol=..) gives error with guidance DT = data.table(a=1:2,v=3:6) test(467, DT[,list(newcol=7L,.SD),by=a], error="Error.*use := by group instead") # Test empty list column DT = data.table(a=1:3,b=4:6) test(468, DT[,foo:=list()], data.table(a=1:3,b=4:6,foo=list())) # Test plonk list test(469, DT[,bar:=list(1,"a",3.14)], data.table(a=1:3,b=4:6,foo=list(),bar=list(1,"a",3.14))) # Test plonk list variable (to catch deparse treating j=list() specially) x = list(2,"b",2.718) test(470, DT[,baz:=x], data.table(a=1:3,b=4:6,foo=list(),bar=list(1,"a",3.14),baz=list(2,"b",2.718))) # Test recycling list DT = data.table(a=1:4,b=5:8) test(471, DT[,foo:=list("a",2:3)], data.table(a=1:4,b=5:8,foo=list("a",2:3,"a",2:3))) # Test recycling singleton list DT[,foo:=NULL] test(472, DT[,foo:=list(list(2:3))], data.table(a=1:4,b=5:8,foo=list(2:3,2:3,2:3,2:3))) # Test adding new column with a recycled factor, #1691 DT = data.table(a=1:4,b=5:8) DT[,c:=factor("a")] test(473, DT, data.table(a=1:4,b=5:8,c=factor(c("a","a","a","a")))) DT[,d:=factor(c("a","b"))] test(474, DT, data.table(a=1:4,b=5:8,c=factor(c("a","a","a","a")),d=factor(c("a","b","a","b")))) # Test scoping error introduced at 1.6.1, unique(DT) when key column is 'x' DT=data.table(x=c("a", "a", "b", "b"), y=c("a", "a", "b", "b"), key="x") test(475, unique(DT), data.table(x=c("a","b"),y=c("a","b"),key="x")) # Test character and list columns in tables with many small groups N = if (.devtesting) 1000L else 100L DT = data.table(grp=1:(2*N),char=sample(as.hexmode(1:N),4*N,replace=TRUE),int=sample(1:N,4*N,replace=TRUE)) ans = DT[,list(p=paste(unique(char),collapse=","), i=list(unique(int))), by=grp] test(476, nrow(as.matrix(ans)), 2L*N) # The as.matrix triggers the "'getCharCE' must be called on a CHARSXP", or similar symptom of earlier corruption, before fix in dogroups.c. # Test that plonking from calling scope works, even after removing, and column copy via := is ok too. DT = data.table(a=1:3) foo = 4:6 DT[,foo:=foo] rm(foo) gc() DT[,foo2:=foo] DT[2,foo:=10L] DT[3,foo2:=11L] gc() test(477, DT, data.table(a=1:3,foo=c(4L,10L,6L),foo2=c(4L,5L,11L))) test(478, DT[,foo:=foo], DT) # does nothing, with no warning, consistent with base R `a<-a`. # Test that recycling now works with oversized inputs and % != 0 length, both with warnings. DT = data.table(x=1:4) test(479, DT[, a:=5:7], data.table(x=1:4,a=c(5:7,5L)), warning="Supplied 3 items to be assigned to 4 items of column 'a' (recycled leaving remainder of 1 items)") # Test that multiple columns can be added DT = data.table(x=1:4) test(481, DT[, c("foo","bar"):=list(10L,11:14), with=FALSE], data.table(x=1:4,foo=10L,bar=11:14)) # and combined with update and add in one step test(482, DT[, c("foo","baz"):=list(12L,15:18), with=FALSE], data.table(x=1:4,foo=12L,bar=11:14,baz=15:18)) # Test that errors in := do not leave DT in bad state, #1711 DT = data.table(x=1:4) test(483, DT[,c("foo","bar"):=list(20L,numeric()),with=FALSE], error="RHS of assignment to new column.*bar.*is zero length but not empty list") test(484, DT, data.table(x=1:4)) # i.e. DT as it was before, without foo being added as it did in v1.7.7- # Test i's key longer than x's d1 <- data.table(a=1:2, b=11:14, key="a,b") d2 <- data.table(A=0:1, B=1:4, key="A") test(485, d2[d1, allow.cartesian=TRUE], data.table(A=INT(1,1,1,1,2,2),B=INT(2,4,2,4,NA,NA),b=INT(11,11,13,13,12,14),key="A")) test(486, d2[d1,sum(B)], data.table(A=INT(1,1,2,2),V1=INT(6,6,NA,NA),key="A")) # no allow.cartesian needed due to by-without-by if ("package:reshape" %in% search()) { DT <- data.table(ID=rep(1:3, each=3), TIME=rep(1:3, 3), X=1:9) test(487, data.table(reshape(DT, idvar="ID", timevar="TIME", direction="wide")), data.table(ID=1:3,X.1=INT(1,4,7),X.2=INT(2,5,8),X.3=INT(3,6,9))) # The data.table() around reshape is to drop reshape's attributes. DT <- data.table(ID=rep(1:3, each=3), TIME=rep(1:3, 3), X=1:9, Y=10:18) test(488, data.table(reshape(DT, idvar="ID", timevar="TIME", direction="wide")), data.table(ID=1:3,X.1=INT(1,4,7),Y.1=INT(10,13,16),X.2=INT(2,5,8),Y.2=INT(11,14,17),X.3=INT(3,6,9),Y.3=INT(12,15,18))) } else { cat("Tests 487 and 488 not run. If required call library(reshape) first.\n") } # Test warnings for names<- and colnames<-, but only warnings when caller is data.table aware. DT = data.table(a=1:3,b=4:6) test(489, names(DT)[1]<-"A", "A", warning="names(x)<-value syntax.*Please change to setnames") test(490, names(DT), c("A","b")) test(491, colnames(DT)[2]<-"B", "B", warning="colnames(x)<-value syntax.*Please change to setnames") test(492, names(DT), c("A","B")) # Check setnames out of bounds errors test(493, setnames(DT,"foo","bar"), error="not found.*foo") test(494, setnames(DT,3,"bar"), error="outside range.*3") # Test new function setcolorder() DT = data.table(a=1:2,b=3:4,c=5:6) test(495, setcolorder(DT,c(2,1,3)), data.table(b=3:4,a=1:2,c=5:6)) test(496, setcolorder(DT,c(2,1,3)), data.table(a=1:2,b=3:4,c=5:6)) test(497, setcolorder(DT,c("c","a","b")), data.table(c=5:6,a=1:2,b=3:4)) test(498, setcolorder(DT,"a"), error="neworder is length") test(498.1, setcolorder(DT,c("d","a","b")), error="Names in neworder not found in x: d") # test first group listens to nomatch when j uses join inherited scope. x <- data.table(x=c(1,3,8),x1=10:12, key="x") y <- data.table(x=c(3,8,10),y1=10:12, key="x") test(499, y[x,x1,nomatch=0], data.table(x=c(3,8),x1=11:12, key="x")) test(500, y[x,x1,nomatch=NA], data.table(x=c(1,3,8),x1=10:12, key="x")) # Test merge bug of unkeyed tables introduced in 1.6.8 and 1.6.9 reported by Eric, and ... dt1 <- data.table(l = factor(c("a","b","a","b"))) dt2 <- data.table(l = factor(c("a","b")), L = factor(c("A","B"))) test(501, setkey(merge(dt1,dt2,by="l"),NULL), as.data.table(merge(as.data.frame(dt1), as.data.frame(dt2), by="l"))) dt1 <- data.table(l = c("a","b","a","b")) dt2 <- data.table(l = c("a","b"), L = c("A","B")) test(501.5, setkey(merge(dt1,dt2,by="l"),NULL), as.data.table(merge(as.data.frame(dt1), as.data.frame(dt2), by="l"))) # ... similar example from DM dtA = data.table(i = 1:8, j = rep(1:2, 4), k = rep(1:4, 2), A = 10:17) dtB = data.table(j = rep(1:2, 2), k = 1:4, B = 18:21) test(502, merge(dtA, dtB, by = c("j","k"), all.x = TRUE), data.table(j=rep(1:2,each=4), k=rep(INT(1,3,2,4),each=2), i=INT(1,5,3,7,2,6,4,8), A=INT(10,14,12,16,11,15,13,17), B=rep(INT(18,20,19,21),each=2), key="j,k")) test(503, dtA$i, 1:8) # check that merge didn't change the order of dtA by reference test(504, dtB$k, 1:4) # or dtB # Test new i. JIS prefix in 1.7.10 DT = data.table(a=1:2,b=1:4,key="a") test(505, DT[J(a=1,b=6),sum(i.b*b)]$V1, 24) # 24 now 'double' because i.b is 'double' # Test := after a key<- DT = data.table(a=3:1,b=4:6) test(506, key(DT)<-"a", "a", warning="can copy the whole table") test(508, DT, data.table(a=1:3,b=6:4,key="a")) test(509, DT[,b:=10L], data.table(a=1:3,b=10L,key="a")) test(510, DT[,c:=11L], data.table(a=1:3,b=10L,c=11L,key="a")) # Used to be warning about invalid .internal.selfref detected and fixed. As from v1.8.3 data.table() returns a NAMED==0 object, and key<- appears not to copy that. But within functions, key<- would still copy. TO DO: add tests.... #test(511,) # Test new functons chmatch and %chin% y=letters x=c(sample(letters,12),"foo","bar") test(512, chmatch(x,y), match(x,y)) test(513, chmatch(x,y,nomatch=0), match(x,y,nomatch=0)) test(514, x %chin% y, x %in% y) # Test new function set() in v1.8.0 DT = data.table(a=1:3,b=4:6) test(515, set(DT,2,1,3), data.table(a=c(1L,3L,3L),b=4:6), warning="Coerced i") test(516, set(DT,"2",1,3), error="i is type 'character'") test(517, set(DT,2L,1,3), DT, warning="Coerced j") # FR #2551 implemented - removed warning from 518 # test(518, set(DT,2L,1L,3), DT, warning="Coerced 'double' RHS to 'integer'") test(518, set(DT,2L,1L,3), DT) test(519, set(DT,2L,1L,3L), data.table(a=INT(1,3,3),b=4:6)) test(520, set(DT,2L,"a",2L), data.table(a=1:3,b=4:6)) test(521, set(DT,2:3,"b",7:8), data.table(a=1:3,b=INT(4,7,8))) test(522, set(DT,2L,"foo",7L), data.table(a=1:3,b=INT(4,7,8), foo=INT(NA,7,NA))) # error="foo.*is not a column name[.] Cannot add columns with set.*use := instead") test(523, set(DT,2L,c("a","a"),list(9L,10L)), error="Can't assign to the same column twice in the same query (duplicates detected).") test(523.1, set(DT,2L,"a",10L), data.table(a=INT(1,10,3),b=INT(4,7,8), foo=INT(NA,7,NA))) setkey(DT,b) test(524, set(DT,2L,"a",2L), data.table(a=1:3, b=INT(4,7,8), foo=INT(NA,7,NA), key="b")) test(525, set(DT,1L,"b",6L), data.table(a=1:3, b=6:8, foo=INT(NA,7,NA))) test(525.1, set(DT,j="b",value=9:11), data.table(a=1:3, b=9:11, foo=INT(NA,7,NA))) # plonk syntax via missing i (fixed in 1.8.1) test(525.2, set(DT,NULL,"b",12:14), data.table(a=1:3, b=12:14, foo=INT(NA,7,NA))) # plonk syntax via NULL i # NEW ADDITIONAL TESTS FOR set() - bug #2077 - for using set to add columns by reference DT1 <- data.table(x = 1, y = 1:10, fac = sample(LETTERS[1:3], 10, replace = TRUE)) # from SO DT2 <- copy(DT1) mul=c(5.3,2.8) for (j in seq_along(mul)) set(DT1, i=NULL, j=paste("dot", j, sep=""), mul[j]*DT1[[j]]) DT2[, `:=`(dot1=5.3*x, dot2=2.8*y)] test(1096.1, DT1, DT2) set(DT1, i=NULL, j="dot2", value=NULL) # remove "dot2" test(1096.2, DT1, DT2[, list(x,y,fac, dot1)]) DT2[, dot2 := NULL][5:9, `:=`(bla1 = 0L, x = 3L, bla2 = 2L)] set(DT1, i=5:9, j=c("bla1", "x", "bla2"), value=list(0L, 3L, 2L)) test(1096.3, DT1, DT2) # more testing with many columns including existing columns test(1096.4, set(DT1, i=NULL, j=7L, value=5L), error="Item 1 of column numbers in j is 7 which is outside range.*1.*6.*Use column names instead in j to add new columns.") # Test that data.frame incompability is fixed, came to light in Feb 2012 DT = data.table(name=c('a','b','c'), value=1:3) test(526, base::droplevels(DT[ name != 'a' ]), data.table(name=c('b','c'),value=2:3)) # base:: because we'll implement a fast droplevels, too. if ("package:nlme" %in% search()) { test(527, {x=Orthodont;tt=lme(distance ~ age, data=x); tt[["data"]]=NULL; tt}, {x=as.data.table(Orthodont);tt=lme(distance ~ age, data=x);tt[["data"]]=NULL;tt}) test(528, {x=iris;tt=groupedData( Sepal.Length ~ Sepal.Width | Species, data=x);attr(tt,"class")=NULL;attr(tt,"FUN")=NULL;tt}, {x=as.data.table(iris);tt=groupedData( Sepal.Length ~ Sepal.Width | Species, data=x);attr(tt,"class")=NULL;attr(tt,"FUN")=NULL;attr(tt,".internal.selfref")=NULL;tt}) } # Speed test of chmatch vs match. # sortedmatch was 40 times slower and the wrong approach, removed in v1.8.0. # Example from Tom in Jan 2011 who first found and raised the issue with sortedmatch. if (.timingtests) { cat("Running 30sec (max) test ...");flush.console() n = 1e6 a = as.character(as.hexmode(sample(n,replace=TRUE))) b = as.character(as.hexmode(sample(n,replace=TRUE))) test(529, system.time(ans1<-match(a,b))["user.self"] > system.time(ans2<-chmatch(a,b))["user.self"]) test(530, ans1, ans2) # sorting a and b no longer makes a difference since both match and chmatch work via hash in some way or another cat("done\n") } # Test that .set_row_names() is maintained on .SD for each group DT = data.table(a=INT(1,1,2,2,2,3,3,3,3),b=1:9) test(531, DT[,length(rownames(.SD)),by=a], data.table(a=1:3,V1=2:4)) # Test column names with spaces, bug#1880, and check.names default is now FALSE, too # Thanks to Yang Zhang for the tests. DT = data.table("a b"=INT(1,1,2,2,2),c=1:5) test(532, DT[,sum(c),by="a b"], data.table("a b"=1:2,V1=c(3L,12L))) test(533, names(data.table('a b'=1)[, list('c d'=`a b`)]), "c d") test(534, names(transform(data.table('a b'=1), `c d`=`a b`)), c("a b","c d")) # Test keyby, new in v1.8.0 DT = data.table(a=INT(1,3,1,2,3,2),b=1:2,c=1:3,v=1:6) test(535, DT[,sum(v),by=a, keyby=a], error="not both") test(536, DT[,sum(v),by=a], data.table(a=c(1L,3L,2L),V1=c(4L,7L,10L))) # retains appearance order ans = data.table(a=1:3,V1=c(4L,10L,7L),key="a") test(537, DT[,sum(v),keyby=a], ans) test(538, DT[,sum(v),keyby="a"], ans) var="a" test(539, DT[,sum(v),keyby=eval(var)], ans) a=quote(a%%2L) test(540, DT[,sum(v),by=eval(a)], data.table(a=1:0,V1=c(11L,10L))) test(541, DT[,sum(v),keyby=eval(a)], data.table(a=0:1,V1=c(10L,11L),key="a")) test(542, DT[,sum(v),keyby=c("a","b","c")]$V1, INT(1,3,4,6,5,2)) test(543, DT[,sum(v),keyby="a,b,c"]$V1, INT(1,3,4,6,5,2)) test(544, DT[,sum(v),keyby=c("a","b,c")], error="but one or more items include a comma") # Test single expressions passed to by, FR#1743 in v1.8.0 DT = data.table(a=1:4,date=as.IDate("2012-02-28")+0:3,v=5:8) test(545, DT[,sum(v),by=a%%2L], data.table(a=1:0,V1=c(12L,14L))) test(546, DT[,sum(v),by=month(date)], data.table(month=2:3,V1=c(11L,15L))) # Test that factor levels no longer need to be sorted, and that 'ordered' class is retained. # Posted by Allan Engelhardt ... x = factor(LETTERS[1:3], levels=rev(LETTERS), ordered=TRUE) DT = data.table(A=x,B=x,v=1:3, key="A") test(547,is.ordered(DT$A) && is.ordered(DT$B)) test(548.1, DT["A",v,verbose=TRUE], output="Coercing character column i.'A' to factor to match type of x.'A'") # changed i.V1 to i.A as per FR #2693 test(548.2, DT["A",v]$v,1L) # Posted by Damian Betebenner ... set.seed(123) my.course.sample = sample(1:5, 10, replace=TRUE) Y = factor(my.course.sample, levels=1:5, labels=c("Basic Math", "Calculus", "Geometry", "Algebra I", "Algebra II")) DT = data.table(ID=1:10, COURSE=Y) test(549, DT[,sum(ID),by=COURSE]$V1, INT(1,2,29,17,6)) setkey(DT, COURSE) test(550, DT[,sum(ID),by=key(DT)]$V1, INT(6,1,29,2,17)) # Another test of DT[i] syntax from datatable-unaware packages, #1794 from ilprincipe. DF = structure(list(sample = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("panel.1yr", "panel.2yr", "panel.3yr", "panel.inc", "pre.inc", "pre.prev", "post.inc", "post.prev"), class = "factor"), base = c(2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2002, 2002, 2002, 2002, 2002, 2002, 2002, 2002), ref = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("2004", "2002-2004", "2001", "2000", "2009", "2008"), class = "factor"), var = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("distance", "time"), class = "factor"), treated = c(0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1), distance = c(10000, 30000, 50000, 1e+05, 10000, 30000, 50000, 1e+05, 10000, 30000, 50000, 1e+05, 10000, 30000, 50000, 1e+05), all = c(602L, 6357L, 8528L, 9272L, 435L, 2438L, 3456L, 6360L, 245L, 2693L, 3699L, 4084L, 187L, 983L, 1400L, 2660L), di.recip = c(5L, 39L, 57L, 62L, 4L, 16L, 22L, 45L, 2L, 25L, 36L, 37L, 1L, 11L, 16L, 35L), irr = c(0.00830564784053156, 0.00613496932515337, 0.00668386491557223, 0.00668679896462468, 0.00919540229885057, 0.00656275635767022, 0.00636574074074074, 0.00707547169811321, 0.00816326530612245, 0.00928332714444857, 0.0097323600973236, 0.00905974534769833, 0.0053475935828877, 0.0111902339776195, 0.0114285714285714, 0.0131578947368421)), .Names = c("sample", "base", "ref", "var", "treated", "distance", "all", "di.recip", "irr"), row.names = c(NA, 16L), class = "data.frame") DT = as.data.table(DF) test(551, nrow(reshape(DT, v.names = c("all", "di.recip", "irr"), timevar = "treated", idvar = c("sample", "var", "distance"), direction = "wide" )), 8L) # Test bug report #1275 from S Bagley : DT = data.table(a=c("1","1"), b=c(2,2)) test(552, is.character(DT$a)) test(553, unique(DT), data.table(a="1",b=2)) # Test bug #1726 from Ivan Zhang. DT = data.table(V1=c('a', 'b', 'a'), V2 = c('hello', 'ello', 'llo')) test(554, nrow(DT[V1=='a' & V2 %like% 'll']), 2L) test(555, nrow(DT[V1=='a' & V2 %like% 'ello']), 1L) # Test can't := to .SD, #1727 DT = data.table(x = 1:5, y = rnorm(5)) test(556, DT[,.SD[,z:=rnorm(1)],by=x], error="[.]SD is locked.*reserved for possible future use") f = function(.SD) .SD[,z:=rnorm(1)] test(557, DT[, f(.SD), by=x], error="[.]SD is locked.*reserved for possible future use") # Test printing on nested data.table, bug #1803 DT = data.table(x=letters[1:3],y=list(1:10,letters[1:4],data.table(a=1:3,b=4:6))) test(558, capture.output(print(DT)), c(" x y","1: a 1,2,3,4,5,6,","2: b a,b,c,d","3: c ")) test(559, setkey(DT,x)["a",y][[2]][[1]], 1:10) # y is symbol representing list column, specially detected in dogroups # Test renaming of .N to N DT = data.table(a=INT(1,1,2,2,2),b=INT(1,2,2,2,1)) test(560.1, DT[,.N,a][,.N], 2L) test(560.2, DT[,.N,a][,N], 2:3) test(561, DT[,.N,a][,N], 2:3) test(562, DT[,list(.N),a][,N], 2:3) test(563, DT[,.N,a][,unique(.N),a]$V1, c(1L,1L)) test(564, DT[,.N,a][,unique(N),a]$V1, 2:3) test(565, DT[,.N,a][N>2], data.table(a=2L, N=3L)) test(566, DT[,list(.N=.N),a][.N>2], data.table(a=2L,.N=3L)) test(567, DT[,.N,list(a,b)][,N,by=a]$N, c(1L,1L,2L,1L)) test(568, DT[,.N,list(a,b)][,unique(N),by=a]$V1, c(1L,2L,1L)) test(569, DT[,list(.N=.N),list(a,b)][,.N,a], error="The column '.N' can't be grouped because") test(570, DT[,list(.N=.N),list(a,b)][,unique(.N),a], error="The column '.N' can't be grouped because") # Test spaces in by="..." format, datatable-help on 31 March DT = data.table("a "=1:2, "b"=3:4," b"=5:6, v=1:6) test(571, DT[,sum(v),by="b, b"], data.table("b"=3:4, " b"=5:6, V1=c(9L,12L))) test(572, DT[,sum(v),by="a , b"], data.table("a "=1:2, " b"=5:6, V1=c(9L,12L))) test(573, DT[,sum(v),by="b, a"], error="object ' a' not found") # Test base::unname, used by melt, and only supported by data.table for DF compatibility for non-dtaware packages DT = data.table(a=1:3, b=4:6) test(574, dim(unname(DT)), 3:2) # Test that CJ retains explicit names (useful if used independently) test(575, CJ(x=c(1L,2L), y=c("a","b")), data.table(x=c(1L,1L,2L,2L),y=c("a","b","a","b"),key="x,y")) test(576, CJ(c(1L,2L), y=c("a","b")), data.table(V1=c(1L,1L,2L,2L),y=c("a","b","a","b"),key="V1,y")) test(577, CJ(x=c(1L,2L), c("a","b")), data.table(x=c(1L,1L,2L,2L),V2=c("a","b","a","b"),key="x,V2")) # Test factor to character join when factor contains unused and reverse order levels : X = data.table(a=LETTERS[1:4],v=1:4,key="a") Y = data.table(a=factor(c("D","B"),levels=rev(LETTERS)),key="a") test(578, X[Y,verbose=TRUE], output="Coercing factor column i.'a' to character to match type of x.'a'") test(579, X[Y], data.table(a=c("D","B"), v=c(4L,2L))) # Test that logical i in set() returns helpful error DT = data.table(a=1:3,b=4:6) test(580, set(DT,a<3,"b",0L), error="simply wrap with which(), and take the which() outside the loop if possible for efficiency") # Test by on empty tables (and when i returns no rows), #1945 DT = data.table(a=1:3,v=1:6) test(581, DT[a<1,sum(v),by=a], data.table(a=integer(),V1=integer())) test(582, DT[a<1,sum(v),by=list(a)], data.table(a=integer(),V1=integer())) test(583, DT[a<1], DT[0]) test(584, DT[a<1], output="Empty data.table (0 rows) of 2 cols: a,v") test(585, DT[a<1,list(v)], output="Empty data.table (0 rows) of 1 col: v") test(586, data.table(a=integer(),V1=integer()), output="Empty data.table (0 rows) of 2 cols: a,V1") # Test that .N is available in by on empty table, also in #1945 test(587, DT[a<1,list(sum(v),.N),by=a], data.table(a=integer(),V1=integer(),N=integer())) # Realised that DT[NULL] returned an error. test(588, DT[NULL], data.table(NULL)) # Test that .N, .SD and .BY are available when by is missing and when by is 0 length DT = data.table(x=rep(1:3,each=3), y=c(1,3,6), v=1:9) test(589, DT[,sapply(.SD,sum)*.N], c(x=162, y=270, v=405)) test(590, DT[,sapply(.SD,sum)*.N,by=NULL], data.table(V1=c(162,270,405))) test(591, DT[,sapply(.SD,sum)*.N,by=character()], data.table(V1=c(162,270,405))) test(592, DT[,sapply(.SD,sum)*.N,by=""], data.table(V1=c(162,270,405))) test(593, DT[,lapply(.SD,sum)], data.table(x=18L, y=30, v=45L)) # bug fix #2263 in v1.8.3: now data.table result for consistency test(594, DT[,lapply(.SD,sum),by=NULL], data.table(x=18L, y=30, v=45L)) test(595, DT[,lapply(.SD,sum),by=character()], data.table(x=18L, y=30, v=45L)) test(596, DT[,lapply(.SD,sum),by=""], data.table(x=18L, y=30, v=45L)) # Test keys of two numeric columns, bug#2004 DT = data.table(x=0.0,y=c(0.0,0.1,0.0,0.2,0.0)) test(597, unique(DT), DT[c(1,2,4)]) test(598, DT[,list(count=.N),by=c("x","y")], data.table(x=0.0,y=c(0.0,0.1,0.2),count=c(3L,1L,1L))) # And that numeric NAs sort stably to the beginning. Whether NAs are allowed in keys, another issue but # ordernumtol needs to deal with NA anyway for add hoc by and unique. DT = data.table( c(1.34, 1.34, 1.34, NA, 2.22, 2.22, 1.34, NA, NA, 1.34, 0.999), c(75.1, NA, 75.1, 75.1, 2.3, 2.4, 2.5, NA, 1.1, NA, 7.9 )) test(599, DT[c(8,9,4,11,2,10,7,1,3,5,6)], setkey(setkey(DT),NULL)) set.seed(1) DT = data.table(x=rep(c(1,2), each=10), y=rnorm(20)) setkey(DT, x, y) test(600, is.sorted(DT$x)) test(601, !is.sorted(DT$y)) test(602, base::order(DT$x,DT$y), 1:20) # Test that as.list.data.table no longer copies via unclass, so speeding up sapply(DT,class) and lapply(.SD,...) etc, #2000 N = if (.devtesting) 1e6 else 1e4 DT = data.table(a=1:N,b=1:N,c=1:N,d=1:N) # 15MB in dev testing, but test with N=1e7 if (.devtesting) test(603, system.time(sapply(DT,class))["user.self"] < 0.1) # Tests on loopability, i.e. that overhead of [.data.table isn't huge, as in speed example in example(":=") # These are just to catch slow down regressions where instead of 1s it takes 40s if (.devtesting) { # TO DO: find more robust way to turn these on for CRAN checks test(604, system.time(for (i in 1:1000) nrow(DT))["user.self"] < 0.5) test(605, system.time(for (i in 1:1000) ncol(DT))["user.self"] < 0.5) test(606, system.time(for (i in 1:1000) length(DT[[1L]]))["user.self"] < 0.5) # much faster than nrow, TO DO: replace internally } # TO DO: move to stress test script off CRAN ... # DT = as.data.table(matrix(1L,nrow=100000,ncol=100)) # test(607, system.time(for (i in 1:1000) DT[i,V1:=i])["user.self"] < 10) # 10 to be very wide margin for CRAN # test(608, DT[1:1000,V1], 1:1000) # Crash bug of chorder(character()), #2026 test(609, chorder(character()), base::order(character())) test(610, chorder(""), base::order("")) # Extra tests of chorder and chgroup x = sample(LETTERS) test(610.1, chorder(x), base::order(x)) test(610.2, chgroup(x), seq_along(x)) x = sample(LETTERS,1000,replace=TRUE) test(610.3, chorder(x), base::order(x)) test(610.4, unique(x[chgroup(x)]), unique(x)) # := by group DT = data.table(a=1:3,b=(1:9)/10) test(611, DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) setkey(DT,a) test(612, DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) # Assign to subset ok (NA initialized in the other items) ok : test(613, DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3)) test(614, DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3)) test(615, DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3)) # Combining := by group with i test(616, DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) test(617, DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) # Empty i clause, #2034. Thanks to Chris for testing, tests from him. test(618, copy(DT)[a>3,r:=sum(b)], DT) test(619, copy(DT)[J(-1),r:=sum(b)], DT) test(620, copy(DT)[J(-1),r:=sum(b),nomatch=0], DT) DT = data.table(x=letters, key="x") test(621, copy(DT)[J("bb"), x:="foo"], DT) # when no update, key should be retained test(622, copy(DT)[J("bb"), x:="foo",nomatch=0], DT) set.seed(2) DT = data.table(a=rnorm(5)*10, b=1:5) test(623, DT[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) # Tests on POSIXct attributes DT = data.table(a=c(1,1,2,2,2)) test(624, attributes(DT[,as.POSIXct("2011-12-13 18:50",tz="EST"),by=a][[2]]), list(class=c("POSIXct","POSIXt"),tzone="EST")) DT = data.table(x = rnorm(5)) DT$time1 <- Sys.time() # recycle via *tmp* DT$time2 <- rep(Sys.time(), 5) # plonk via *tmp* DT[,time3:=Sys.time()] # recycle DT[,time4:=rep(Sys.time(),5)] # plonk test(625, all(sapply(DT,is,"POSIXct")[-1])) # unique on ITime doesn't lose attributes, #1719 t = as.ITime(strptime(c("09:10:00","09:11:00","09:11:00","09:12:00"),"%H:%M:%S")) test(626, unique(t), t[c(1,2,4)]) test(627, class(unique(t)), "ITime") # Test recycling list() rbind test(628, rbind(data.table(a=1:3,b=5:7,c=list(1:2,1:3,1:4)), list(4L,8L,as.list(1:3))), data.table(a=c(1:3,rep(4L,3L)),b=c(5:7,rep(8L,3L)),c=list(1:2,1:3,1:4,1L,2L,3L))) # Test switch in .rbind.data.table for factor columns test(628.5, rbind(data.table(a=1:3,b=factor(letters[1:3]),c=factor("foo")), list(4L,factor("d"),factor("bar"))), data.table(a=1:4,b=factor(letters[1:4]),c=factor(c(rep("foo",3),"bar"), levels = c("foo", "bar")))) # Test merge with common names and all.y=TRUE, #2011 DT1 = data.table(a=c(1,3,4,5), total=c(2,1,3,1), key="a") DT2 = data.table(a=c(2,3,5), total=c(5,1,2), key="a") # 629+630 worked before anyway. 631+632 test the bug fix. # The .1 tests double check it's the same behaviour as merge.data.frame. adf=as.data.frame adt=as.data.table test(629, merge(DT1,DT2), data.table(a=c(3,5),total.x=c(1,1),total.y=c(1,2),key="a")) test(629.1, merge(DT1,DT2), setkey(adt(merge(adf(DT1),adf(DT2),by="a")),a)) test(630, merge(DT1,DT2,all.x=TRUE), data.table(a=c(1,3,4,5),total.x=c(2,1,3,1),total.y=c(NA,1,NA,2),key="a")) test(630.1, merge(DT1,DT2,all.x=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a",all.x=TRUE)),a)) test(631, merge(DT1,DT2,all.y=TRUE), data.table(a=c(2,3,5),total.x=c(NA,1,1),total.y=c(5,1,2),key="a")) test(631.1, merge(DT1,DT2,all.y=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a",all.y=TRUE)),a)) test(632, merge(DT1,DT2,all=TRUE), data.table(a=c(1,2,3,4,5),total.x=c(2,NA,1,3,1),total.y=c(NA,5,1,NA,2),key="a")) test(632.1, merge(DT1,DT2,all=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a",all=TRUE)),a)) # Test that unsettting datatable.alloccol is caught, #2014 old = getOption("datatable.alloccol") options(datatable.alloccol=NULL) # the return value here seems to be TRUE rather than the old expression TO DO: follow up with r-devel test(633, data.table(a=1:3), error="n must be integer length 1") options(datatable.alloccol=old) # Test that with=FALSE by number isn't messed up by dup column names, #2025 DT = data.table(a=1:3,a=4:6) test(634, DT[,2:=200L,with=FALSE], data.table(a=1:3,a=200L)) # Test names when not all items are named, #2029 DT = data.table(x=1:3,y=1:3) test(635, names(DT[,list(x,y,a=y)]), c("x","y","a")) test(636, names(DT[,list(x,a=y)]), c("x","a")) # Test := by key, and that := to the key by key unsets the key. Make it non-trivial in size too. set.seed(1) DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a") test(637, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)][[2]],key="a")) test(638, key(DT[J(43L),a:=99L]), NULL) setkey(DT,a) test(639, key(DT[,a:=99L,by=a]), NULL) # Test printing is right aligned without quotes etc, and rownames are repeated ok for more than 20 rows DT=data.table(a=8:10,b=c("xy","x","xyz"),c=c(1.1,22.1,0)) test(640, capture.output(print(DT)), c(" a b c","1: 8 xy 1.1","2: 9 x 22.1","3: 10 xyz 0.0")) DT=data.table(a=letters,b=1:26) test(641, tail(capture.output(print(DT[1:20])),2), c("19: s 19","20: t 20")) test(642, tail(capture.output(print(DT[1:21])),2), c("21: u 21"," a b")) DT=data.table(a=as.character(as.hexmode(1:500)), b=1:500) test(643, capture.output(print(DT)), c(" a b"," 1: 001 1"," 2: 002 2"," 3: 003 3"," 4: 004 4"," 5: 005 5"," --- ","496: 1f0 496","497: 1f1 497","498: 1f2 498","499: 1f3 499","500: 1f4 500")) # Test inconsistent length of columns error. DT = list(a=3:1,b=4:3) setattr(DT,"class",c("data.table","data.frame")) test(644, setkey(DT,a), error="Column 2 is length 2 which differs from length of column 1 (3)") test(645, setkey(DT,b), error="Column 2 is length 2 which differs from length of column 1 (3)") # Test faster mean. Example from (now not needed as much) data.table wiki point 3. # Example is a lot of very small groups. set.seed(100) n=1e4 # small n so as not to overload daily CRAN checks. DT=data.table(grp1=sample(1:750, n, replace=TRUE), grp2=sample(1:750, n, replace=TRUE), x=rnorm(n), y=rnorm(n)) DT[c(2,5),x:=NA] # seed chosen to get a group of size 2 and 3 in the first 5 to easily inspect. DT[c(3,4),y:=NA] tt1 = system.time(ans1<-DT[,list(mean(x),mean(y)),by=list(grp1,grp2)]) # 1.1s tt2 = system.time(ans2<-DT[,list(.Internal(mean(x)),.Internal(mean(y))),by=list(grp1,grp2)]) # 1.1s basemean = base::mean # to isolate time of `::` itself tt3 = system.time(ans3<-DT[,list(basemean(x),basemean(y)),by=list(grp1,grp2)]) # 11s test(646, ans1, ans2) test(647, ans1, ans3) test(648, any(is.na(ans1$V1)) && !any(is.nan(ans1$V1))) if (.devtesting) test(649, tt1["user.self"] < 10*tt2["user.self"]) # should be same speed, but *10 as large margin if (.devtesting) test(650, tt1["user.self"] < tt3["user.self"]/2) # 10 times faster, but test 2 times faster as large margin tt1 = system.time(ans1<-DT[,list(mean(x,na.rm=TRUE),mean(y,na.rm=TRUE)),by=list(grp1,grp2)]) # 2.0s tt2 = system.time(ans2<-DT[,list(mean.default(x,na.rm=TRUE),mean.default(y,na.rm=TRUE)),by=list(grp1,grp2)]) # 5.0s test(651, ans1, ans2) test(652, any(is.nan(ans1$V1))) if (.devtesting) test(653, tt1["user.self"] < tt2["user.self"]) # See FR#2067. Here we're just testing the optimization of mean and lapply, should be comparable to above tt2 = system.time(ans2<-DT[,lapply(.SD,mean,na.rm=TRUE),by=list(grp1,grp2)]) setnames(ans2,"x","V1") setnames(ans2,"y","V2") test(654, ans1, ans2) test(655, abs(tt1["user.self"] - tt2["user.self"])<2.0) # unoptimized tt2 takes 30 seconds rather than 2. The difference between tt1 and tt2 is under 0.2 seconds usually, so 2.0 is very large margin for error to ensure it's not 30secs. test(656, DT[,mean(x),by=grp1,verbose=TRUE], output="GForce optimized j to.*gmean") test(657, DT[,list(mean(x)),by=grp1,verbose=TRUE], output="GForce optimized j to.*gmean") test(658, DT[,list(mean(x),mean(y)),by=grp1,verbose=TRUE], output="GForce optimized j to.*gmean") tt = capture.output(DT[,list(mean(x),mean(y)),by=list(grp1,grp2),verbose=TRUE]) test(659, !length(grep("Wrote less rows", tt))) # first group is one row with this seed. Ensure we treat this as aggregate case rather than allocate too many rows. # Test .N for logical i subset DT = data.table(a=1:10, b=rnorm(10)) test(660, DT[a==8L, .N], 1L) # Test that growing is sensible in worst case DT = data.table(a=rep(1:10,1:10),b=rnorm(55)) tt = capture.output(DT[,sum(b)*b,by=a,verbose=TRUE]) test(661, length(grep("growing from",tt))<3) # was 6 when we simply grew enough for latest result # Test that adding a new logical column is supported, #2094 DT=data.table(a=1:3) test(662, DT[,newcol:=NA], data.table(a=1:3,newcol=NA)) test(663, sapply(DT,class), c(a="integer",newcol="logical")) # Test that setting names in the presence of dups is ok, #2103 DT = data.table(a=1:3, b=2:4, a=3:5) test(664, setnames(DT, c('d','e','f')), data.table(d=1:3,e=2:4,f=3:5)) # Test by=c(...) in combination with i subset, #2078 DT = data.table(a=1:3,b=1:6,key="a") test(665, DT[a<3,sum(b),by=c("a"),verbose=TRUE], DT[a<3,sum(b),by="a"], output="i clause present and columns used in by detected") test(666, DT[a<3,sum(b),by=key(DT),verbose=TRUE], DT[a<3,sum(b),by=a], output="i clause present and columns used in by detected") test(667, DT[a<3,sum(b),by=paste("a")], error='Otherwise, by=eval(paste("a")) should work') test(668, DT[a<3,sum(b),by=eval(paste("a"))], DT[a<3,sum(b),by=a]) test(669, DT[a<3,sum(b),by=c(2)], error="must evaluate to 'character'") # Test := keyby does key, #2065 DT = data.table(x=1:2, y=1:6) ans = data.table(x=rep(1:2,each=3),y=c(1L,3L,5L,2L,4L,6L),z=rep(c(9L,12L),each=3),key="x") test(670, DT[,z:=sum(y),keyby=x], ans) DT = data.table(x=1:2, y=1:6) test(671, DT[,z:=sum(y),keyby="x"], ans) DT = data.table(x=1:2, y=1:6) test(672, DT[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), warning=":= keyby not straightforward character column names or list() of column names, treating as a by") DT = data.table(x=1:2, y=1:6) test(673, DT[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) DT = data.table(x=1:2, y=1:6) test(674, DT[x>1,z:=sum(y),keyby=x], error="When i is present, keyby := on a subset of rows doesn't make sense. Either change keyby to by, or remove i") # Test new .() DT = data.table(x=1:2, y=1:6, key="x") test(675, DT[.(1L)], DT[1:3]) # Test new rbindlist l = list(data.table(a=1:2, b=7:8), data.table(a=3:4, 9:10), data.table(5:6, 11:12), data.table(b=13:14), list(15:16,17L), list(c(18,19),20:21)) test(676, rbindlist(l[1:3]), data.table(a=1:6,b=7:12)) test(677, rbindlist(l[c(10,1,10,2,10)]), data.table(a=1:4,b=7:10)) # NULL items ignored test(678, rbindlist(l[c(1,4)]), error="Item 2 has 1 columns, inconsistent with item 1 which has 2") test(679, rbindlist(l[c(1:2,5)]), error="Column 2 of item 3 is length 1, inconsistent with first column of that item which is length 2.") test(680, rbindlist(l[c(2,6)]), data.table(a=c(3,4,18,19), V2=c(9:10,20:21))) # coerces 18 and 19 to numeric (with eddi's changes in commit 1012 - highest type is preserved now) --- Caught and changed by Arun on 26th Jan 2014 (in commit 1099). ### ----> Therefore this TO DO may not be necessary here anymore (added by Arun 26th Jan 2014) ---> # TO DO when options(datatable.pedantic=TRUE): test(680.5, rbindlist(l[c(2,6)]), warning="Column 1 of item 2 is type 'double', inconsistent with column 1 of item 1's type ('integer')") test(681, rbindlist(list(data.table(a=letters[1:2],b=c(1.2,1.3),c=1:2), list("c",1.4,3L), NULL, list(letters[4:6],c(1.5,1.6,1.7),4:6))), data.table(a=letters[1:6], b=seq(1.2,1.7,by=0.1), c=1:6)) test(682, rbindlist(NULL), data.table(NULL)) test(683, rbindlist(list()), data.table(NULL)) test(684, rbindlist(list(NULL)), data.table(NULL)) test(685, rbindlist(list(data.table(NULL))), data.table(NULL)) # Test merge when no overlap of data in by columns when all=TRUE, #2114 DF1=data.frame(foo=letters[1:5], bar=1:5, stringsAsFactors=FALSE) DF2=data.frame(foo=letters[6:10], baz=6:10, stringsAsFactors=FALSE) DT1=as.data.table(DF1) DT2=as.data.table(DF2) test(686, merge(DF1, DF2, by="foo", all=TRUE), as.data.frame(merge(DT1,DT2,by="foo",all=TRUE))) DF1=data.frame(foo=letters[1:5], bar=1:5, stringsAsFactors=TRUE) DF2=data.frame(foo=letters[6:10], baz=6:10, stringsAsFactors=TRUE) DT1=as.data.table(DF1) DT2=as.data.table(DF2) test(687, merge(DF1, DF2, by="foo", all=TRUE), as.data.frame(merge(DT1,DT2,by="foo",all=TRUE))) # And a more basic test that #2114 revealed that factor to factor join was leaving NA in the i # factor columns, caught in 1.8.1 beta before release to CRAN. DT = data.table(a=factor(letters[1:4]), b=5:8, key="a") test(688, DT[J(factor("b"))], data.table(a=factor("b"), b=6L, key="a")) # Test removing a column followed by adding a new column using := by group, #2117 DT = data.table(a=1:3,b=4:6) DT[,b:=NULL] test(689, DT[,b:=.N,by=a], data.table(a=1:3, b=1L)) test(690, DT[,c:=2,by=a], data.table(a=1:3, b=1L, c=2)) # Test combining i with by, with particular out of order circumstances, #2118 set.seed(1) DT=data.table(a=sample(1:5,20,replace=TRUE),b=1:4,c=1:10) test(691, DT[a>2,sum(c),by=b], DT[a>2][,sum(c),by=b]) test(692, DT[a>2,sum(c),by=b%%2L], data.table(b=1:0,V1=c(34L,42L))) test(693, DT[a>2,sum(c),by=(b+1)%%2], data.table(b=c(0,1),V1=c(34L,42L))) setkey(DT,b) test(694, DT[a>2,sum(c),by=b], DT[a>2][,sum(c),by=b]) test(695, DT[a>2,sum(c),by=b%%2L], data.table(b=1:0,V1=c(34L,42L))) test(696, DT[a>2,sum(c),by=(b+1)%%2], data.table(b=c(0,1),V1=c(34L,42L),key="b")) # Test subset and %chin% crash with non-character input, #2131 test(697, 4 %chin% letters, error="type") test(698, 4L %chin% letters, error="type") test(699, "a" %chin% 4, error="type") DT = data.table(aa=1:6,bb=7:12) test(700, subset(DT,select="aa"), DT[,list(aa)]) test(701, subset(DT,select=aa), DT[,list(aa)]) test(702, subset(DT,select=c(aa)), DT[,list(aa)]) setkey(DT,aa) test(703, subset(DT,select="aa"), data.table(aa=1:6,key="aa")) test(704, subset(DT,select=aa), data.table(aa=1:6,key="aa")) test(705, subset(DT,select=c(aa)), data.table(aa=1:6,key="aa")) # Test rbinding of logical columns, #2133 DT1 = data.table(A=1:3,B=letters[1:3],C=c(TRUE,TRUE,FALSE)) DT2 = data.table(A=4:5,B=letters[4:5],C=c(TRUE,FALSE)) test(706, rbind(DT1,DT2), data.table(A=1:5, B=letters[1:5], C=c(TRUE,TRUE,FALSE,TRUE,FALSE))) test(707, rbindlist(list(DT1,DT2)), rbind(DT1,DT2)) # Test non ascii characters when passed as character by, #2134 # ***** # TO DO: reinstate. Temporarily removed to pass CRAN's Mac using C locale (R-Forge's Mac is ok) # ***** # Test := adding column after a setnames of all column names (which [,list(x)] does), #2146 DT = data.table(x=1:5)[,list(x)] test(713, DT[,y:=5], data.table(x=1:5,y=5)) setnames(DT,c("A","B")) test(714, DT[,z:=6:10], data.table(A=1:5,B=5,z=6:10)) # Test J alias is now removed outside DT[...] from v1.8.7 (to resolve rJava::J conflict) test(715, J(a=1:3,b=4), data.table(a=1:3,b=4), error="could not find function.*J") # Test get in j DT = data.table(a=1:3,b=4:6) test(716, DT[,get("b")], 4:6) # TO DO: add warning about inefficiency when datatable.pedantic=TRUE test(717, DT[,get("b"),verbose=TRUE], output="xvars being set to all columns") # Test that j can be a logical index when `with=FALSE` (#1797) DT = data.table(a=1:10, b=rnorm(10), c=letters[1:10]) test(718, DT[, c(FALSE, TRUE, FALSE), with=FALSE], DT[, 2, with=FALSE]) test(719, nrow(DT[, c(FALSE, FALSE, FALSE), with=FALSE]), 0L) # Test combining join with missing groups with group by, #2162 DT = data.table(a = 1, b = 2, c = 4, key="a") test(720, DT[list(c(5,6,7)), .N, by=b], data.table(b=NA_real_,N=3L,key="b")) test(721, DT[list(c(5,6,7))][, .N, by=b], DT[list(c(5,6,7)), .N, by=b]) test(722, DT[list(c(5,6,7)), .N, by=b, mult="first"], data.table(b=NA_real_,N=3L,key="b")) test(723, DT[list(c(5,6,7)), .N, by=b, nomatch=0], data.table(b=numeric(),N=integer(),key="b")) # Key here is correct. by is ordered (albeit empty) test(724, DT[list(c(5,6,7)), .N, by=b, nomatch=0], DT[list(c(5,6,7)),nomatch=0][,.N,by=b]) # Splitting should always be consistent # another test linked from #2162 DT = data.table(x=rep(c("a","b","c"),each=3), y=c(1L,3L,6L), v=1:9, key="x") test(725, DT[c("a","b","d"),v][,list(v)], DT[J(c("a","b","d")),"v",with=FALSE]) # unfiled bug fix for NA matches; see NEWS 1.8.3 test(726, DT[c("a", "b", "d"), sum(v), by=y, nomatch=0], data.table(y=INT(1,3,6),V1=INT(5,7,9),key="y")) test(727, DT[c("a", "b", "d"), sum(v), by=y], data.table(y=INT(1,3,6,NA),V1=INT(5,7,9,NA))) test(728, DT[c("a", "b", "d"), sum(v), by=y], DT[J(c("a", "b", "d"))][, sum(v), by=y]) oldv = options(datatable.verbose=FALSE) test(729, capture.output(DT[c("a", "b", "d"), print(.SD)]), capture.output(suppressWarnings(DT[c("a", "b", "d"), print(.SD), by=x]))) test(729.1, capture.output(DT[c("a", "b"), print(.SD), by=y]), # TO DO: why doesn't last group have x=d, maybe groups=i in dogroups capture.output(DT[c("a", "b")][, print(.SD), by=y])) options(oldv) test(729.2, DT[c("b","d"),.SD], data.table(x=c("b","b","b","d"),y=INT(1,3,6,NA),v=INT(4,5,6,NA))) # no debate here test(729.3, DT[c("b","d"),.SD, by=y], DT[c("b","d")][,.SD, by=y][4L,x:=NA_character_]) # the i groups when no match don't get carried through (would be hard to implement this and very unlikely to be useful. Just break into compound query, if needed to be used in j, to get them to carry through. TO DO: add to FAQ. # That unnamed i gets x's join column names when j is .SD (or any named list, which verbose warns is inefficient), #2281 test(729.4, DT[c("a","b"),.SD], data.table(x=rep(c("a","b"),each=3),y=INT(1,3,6),v=1:6,key="x")) # check := when combining join with missing groups and then group by test(730, DT[c("b","a"),w:=sum(v),by=y]$w, INT(5,7,9,5,7,9,NA,NA,NA)) # by over a different column than was joined to test(731, DT["d",w:=99,by=y]$w, INT(5,7,9,5,7,9,NA,NA,NA)) # do nothing for missing group, before getting as far as type error test(732, DT["d",w:=99L,by=y]$w, INT(5,7,9,5,7,9,NA,NA,NA)) # do nothing for missing group test(733, DT[c("c","e","b"),w:=sum(v),by=y%%2L]$w, INT(5,7,9,24,24,15,24,24,15)) # Test column type change in the 0 row case (#2274) DT = data.table(a=1:3,b=4:6)[0] test(734, DT[,b:=as.character(b)], data.table(a=integer(),b=character())) test(735, DT[,c:=double()], data.table(a=integer(),b=character(),c=double())) # Deleting multiple columns out-of-order, #2223 DT = data.table(a=1:3,b=4:6,c=7:9,d=10:12,e=13:15,f=16:18,g=19:21) test(736, DT[,c("b","d","g","f","c"):=NULL,with=FALSE], data.table(a=1:3,e=13:15)) # test redundant with=FALSE is ok DT = data.table(a=1:3,b=4:6,c=7:9,d=10:12,e=13:15,f=16:18,g=19:21) test(737, DT[,c("b","d","g","f","c"):=NULL], data.table(a=1:3,e=13:15)) # with no longer needed # Mixing column adds and deletes in one := gave incorrect results, #2251. DT = data.table(c1=1:2) test(738, DT[,c("c2", "c1"):=list(c1+1L, NULL)], data.table(c2=2:3)) # `:=`(c1=v1,v2=v2,...) is now valid , #2254 DT = data.table( c1=1:3 ) test(739, DT[,`:=`(c2=4:6, c3=7:9)], data.table(c1=1:3,c2=4:6,c3=7:9)) test(740, DT[,`:=`(4:6,c3=7:9)], error="all arguments must be named") test(741, DT[,`:=`(4:6,7:9,10:12)], error="all arguments must be named") # test the same error message in the other branch # that out of bounds LHS is caught, root cause of #2254 test(742, DT[,3:6:=1L], error="outside.*range") test(743, DT[,2:3:=99L], data.table(c1=1:3,c2=99L,c3=99L)) test(744, DT[,(ncol(DT)+1):=1L], error="outside.*range") test(745, DT[,ncol(DT):=1L], data.table(c1=1:3,c2=99L,c3=1L)) # multiple LHS with by without by, #2215 DT = data.table(a=letters[c(1:3,3L)],key="a") test(746, DT["a",c("new1","new2"):=list(4L, 5L)], data.table(a=letters[c(1:3,3L)],new1=INT(4,NA,NA,NA),new2=INT(5,NA,NA,NA),key="a")) test(747, DT[,new1:=4:6], data.table(a=letters[c(1:3,3L)],new1=INT(4L,5L,6L,4L),new2=INT(5,NA,NA,NA),key="a"), warning="recycled leaving remainder of 1 item") suppressWarnings(DT[,new1:=4:6]) test(748, DT[c("c","b"),`:=`(new3=.N,new2=sum(new1)+1L)], data.table(a=letters[c(1:3,3L)],new1=INT(4,5,6,4),new2=INT(5,6,11,11),new3=INT(NA,1,2,2),key="a")) # and multiple LHS by group, #1710 DT = data.table(a=rep(6:8,1:3),b=1:6) test(749, DT[,c("c","d","e"):=list(.N,sum(b),a*10L),by=a], data.table(a=rep(6:8,1:3),b=1:6,c=rep(1:3,1:3),d=INT(rep(c(1,5,15),1:3)),e=rep(6:8,1:3)*10L)) test(750, DT[a<8,`:=`(f=b+sum(d),g=.N),by=c][,6:7,with=FALSE], data.table(f=INT(2,12,13,NA,NA,NA),g=INT(1,2,2,NA,NA,NA))) # varname holding colnames, by group and with=FALSE, linked from #2120. DT = data.table(a=rep(1:3,1:3),b=1:6) colname = "newcol" test(751, DT[,colname:=sum(b),by=a,with=FALSE], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) # Add tests for nested := in j by group, #1987 DT = data.table(a=rep(1:3,2:4),b=1:9) test(752, DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) # Test duplicate() of recycled plonking RHS, #2298 DT = data.table(a=letters[3:1],x=1:3) test(753, setkey(DT[,c("x1","x2"):=x],a), data.table(a=letters[1:3],x=3:1,x1=3:1,x2=3:1,key="a")) DT = data.table(a=letters[3:1],x=1:3,y=4:6) test(754, setkey(DT[,c("x1","y1","x2","y2"):=list(x,y)],a), data.table(a=letters[1:3],x=3:1,y=6:4,x1=3:1,y1=6:4,x2=3:1,y2=6:4,key="a")) # And non-recycling i.e. that a single column copy does copy the column DT = data.table(a=1:3) test(754.1, DT[,b:=a][1,a:=4L][2,b:=5L], data.table(a=INT(4,2,3),b=INT(1,5,3))) test(754.2, DT[,b:=a][3,b:=6L], data.table(a=INT(4,2,3),b=INT(4,2,6))) test(754.3, DT[,a:=as.character(a),verbose=TRUE], output="Direct plonk.*no copy") RHS = as.integer(DT$a) test(754.4, DT[,a:=RHS,verbose=TRUE], output="RHS for item 1 has been duplicated") # Test warning on redundant by, #2282. # ** TO DO ** - add this to FAQ warnings section. DT = data.table(a=letters[1:3],b=rep(c("d","e"),each=3),x=1:6,key="a,b") test(755, DT[c("b","c"),sum(x)], data.table(a=c("b","c"),V1=c(7L,9L),key="a")) test(756, DT[c("b","c"),sum(x),by=a], data.table(a=c("b","c"),V1=c(7L,9L),key="a"), warning="by.*is not necessary.*by-without-by") test(757, DT[list(c("b","c"),"d"),sum(x),by=a], data.table(a=c("b","c"),V1=2:3,key="a")) # no warning because 'by' less than number of join columns # join then by when mult=="last"|"first", #2303 (crash in dev 1.8.3 only) DT = data.table(a=1:3,b=1:6,c=7:12,key="a") test(758, DT[J(c(1L,1L)),sum(c),by=b,mult="last"], DT[J(c(1L,1L)),mult="last"][,sum(c),by=b]) test(759, DT[J(1L),c,by=b,mult="last"], DT[J(1L),mult="last"][,c,by=b]) test(760, DT[2:5,sum(c),by=b], DT[2:5][,sum(c),by=b]) test(761, DT[2:5,sum(c),by=b%%2], DT[2:5][,sum(c),by=b%%2]) # joining from empty i table, #2194 DT = data.table(a=1:3,b=4:6,key="a") test(762, DT[J(integer()),b], data.table(a=integer(),b=integer(),key="a")) test(763, DT[J(integer()),1L,by=b], data.table(b=integer(),V1=integer(),key="b")) # ordered by is detected now (empty is ordered), otherwise a join to the result would fail just because it's empty which wouldn't be consistent with non empty case test(764, DT[J(integer()),b,mult="last"], integer()) test(765, DT[J(2L),b,mult="last"], 5L) test(766, DT[J(5L),b,nomatch=0], data.table(a=integer(),b=integer(),key="a")) test(767, DT[J(5:6),b,nomatch=0], data.table(a=integer(),b=integer(),key="a")) # Crash on by-without-by with mixed type non join i columns, #2314. Despite not being used by j they were still being assigned to .BY. DT = data.table(iris,key="Species") Y = data.table(date=as.POSIXct("2011-01-01"),num=as.numeric(1:26)) Y[,get("letters"):=LETTERS] Y[,A:=1:26] Y[,p:=factor(p)] # coerce type to match DT$Species to save warning. Crash was related to .BY internally, not the coercion. setkey(Y,p) for (i in 1:10){DT[Y,Petal.Width];DT[Y];NULL} # reliable crash in 1.8.2 (tested). test(768, DT[Y,Petal.Width], data.table(Species=factor(LETTERS),Petal.Width=NA_real_,key="Species")) DT = data.table(a=1:3,b=1:6,c=7:12, key="a") test(769, DT[,.BY[[1]]==a,by=a], data.table(a=1:3,V1=TRUE,key="a")) test(770, DT[J(2:3),.BY[[1]]==b], data.table(a=INT(2,2,3,3),V1=c(TRUE,FALSE),key="a")) # A data.table RHS of := caused a crash, #2311. a = data.table(first=1:6, third=c(1,1,1,3,3,4), key="first") b = data.table(first=c(3,4,4,5,6,7,8), second=1:7, key="first") test(771, b[,third:=a[b,third]], b, warning="Supplied 2 items.*to 7.*recycled leaving remainder of 1 item") test(772, copy(b)[,third:=as.list(a[b,third])], b, warning="Supplied 2 items.*to 7.*recycled leaving remainder of 1 item") test(773, b[4,third[[1]]], c(1,3,3,3,4,NA,NA)) test(774, b[,third:=a[b,third,mult="first"]], data.table(first=c(3,4,4,5,6,7,8), second=1:7, third=c(1,3,3,3,4,NA,NA), key="first")) # This plonk replaces b with what was likely intended. # That names are dropped. (Names on the column vectors don't display. They increase size and aren't much use.) DT = data.table(a=1:3,b=LETTERS[1:3]) map = c("A"="Foo",B="Bar",C="Baz") DT[,b:=map[b]] test(775, names(DT$b), NULL) # Test that names of named vectors don't carry through, #2307. DT = data.table(a=1:3,b=c("a"="a","b"="a","c"="b")) test(776, names(DT$b), NULL) # From v1.8.11, data.table() drops vector names DT = data.table(a=1:3,b=c("a","a","b")) setattr(DT$b, "names", c("a","b","c")) # Force names in there to test #2307 test(777, names(DT$b), c("a","b","c")) test(778, DT[,sum(a),by=b], data.table(b=c("a","b"),V1=c(3L,3L))) #2307 retained names length 3 on the length 2 vector result causing it not to print. test(779, print(DT[,sum(a),by=b]), output=" b V11: a 32: b 3$") # Test new .GRP binding test(780, data.table(a=1:3,b=1:6)[,i:=.GRP,by=a][,i2:=.GRP], data.table(a=1:3,b=1:6,i=rep(1:3,2),i2=1L)) # Test new .I binding DT = data.table(a=1:4,b=1:8) test(781, DT[,.I,by=a]$.I, INT(1,5,2,6,3,7,4,8)) test(782, DT[,.I[which.max(b)],by=a], data.table(a=1:4,V1=5:8)) setkey(DT,a) test(783, DT[,.I,by=a]$.I, 1:8) test(784, DT[,.I[which.max(b)],by=a], data.table(a=1:4,V1=INT(2,4,6,8),key="a")) test(785, DT[J(2:4),.I,by=a%%2L], data.table(a=rep(0:1,c(4,2)),.I=INT(3,4,7,8,5,6),key="a")) test(786, DT[J(c(3,2,4)),list(.I,.GRP)], data.table(a=rep(c(3L,2L,4L),each=2),.I=INT(5,6,3,4,7,8),.GRP=rep(1:3,each=2L))) test(787, DT[J(3:2),`:=`(i=.I,grp=.GRP)][,list(i,grp)], data.table(i=INT(NA,NA,3:6,NA,NA),grp=INT(NA,NA,2,2,1,1,NA,NA))) # New not-join (a.k.a. not-select, since not just for data.table i but integer, logical and character too) DT = data.table(A=rep(1:3,each=2),B=1:6,key="A") test(788, DT[!J(2)], data.table(A=c(1L,1L,3L,3L),B=c(1L,2L,5L,6L),key="A")) test(789, DT[!(2:6)], DT[1]) test(790, DT[!(2:6)], DT[!2:6]) # nicer than DT[-2:6] applying - to 2 first test(791, DT[!6], DT[1:5]) test(792, DT[!c(TRUE,FALSE)], DT[c(FALSE,TRUE)]) test(793, setkey(DT[,A:=letters[A]],A)[!c("b","c")], DT["a"]) test(794, DT[!"b"], DT[c("a","c")]) test(795, DT[!0], DT) test(796, DT[!NULL], DT[NULL]) test(797, DT[!integer()], DT) test(798, DT[!-1], DT[1]) test(799, DT[--1], DT[1]) myi = c("a","c") test(800, DT[!myi], DT["b"]) test(801, DT[!"c",sum(B),by=A], data.table(A=c("a","b"),V1=c(3L,7L),key="A")) test(802, DT[!"missing",sum(B),by=A], DT[,sum(B),by=A]) test(803, DT[!c("a","missing","b","missing2"),sum(B),by=A], DT["c",sum(B)]) # Combining not-join with which test(804, DT[!"b",which=TRUE], INT(1:2,5:6)) # row numbers in DT that don't match # New which=NA value test(805, DT[c("b","foo","c"),which=NA], 2L) # row numbers in i that don't match test(806, DT[!c("b","foo","c"),which=NA], c(1L,3L)) # row numbers in i that do match test(807, DT[!c("b","foo","c"),nomatch=0], error="not-join.*prefix is present on i.*Please remove nomatch") test(808, DT[c("b","foo","c"),which=TRUE,nomatch=NA], INT(3:4,NA,5:6)) test(809, DT[c("b","foo","c"),which=TRUE,nomatch=0], INT(3:4,5:6)) test(810, DT[c("b","foo","c"),which=NA,nomatch=NA], 2L) test(811, DT[c("b","foo","c"),which=NA,nomatch=0], error="which=NA with nomatch=0 would always return an empty vector[.] Please change or remove either which or nomatch") # New notj for column names and positions when with=FALSE, #1384 DT = data.table(a=1:3,b=4:6,c=7:9) test(812, DT[,!"b",with=FALSE], DT[,-match("b",names(DT)),with=FALSE]) test(813, DT[,"foo",with=FALSE], error="column(s) not found: foo") test(814, DT[,!"foo",with=FALSE], data.table(NULL), warning="column(s) not removed because not found: foo") # TO DO: NULL doesn't seem right test(815, DT[,!c("b","foo"),with=FALSE], DT[,list(a,c)], warning="column(s) not removed because not found: foo") test(816, DT[,!2:3,with=FALSE], DT[,-(2:3),with=FALSE]) # for consistency, but ! is really for character column names mycols = "b" test(817, DT[,!mycols,with=FALSE], DT[,list(a,c)]) mycols = 2 test(818, DT[,!mycols,with=FALSE], DT[,list(a,c)]) # Test X[Y] slowdown, #2216 X = CJ(a=seq_len(1e3),b=seq_len(1e3)) Y = copy(X) X[4,b:=3L] # create a dup group, to force allLen1=FALSE setkey(X) test(819, system.time(X[Y,allow.cartesian=TRUE])["user.self"] < 5) # Many minutes in 1.8.2! Now well under 1s, but 5s for wide tolerance for CRAN. We like CRAN to tell us if any changes # in R or elsewhere cause the 2 minute bug to return. Hence not excluded by an if(.devtesting) test(820, system.time(X[Y,mult="first"])["user.self"] < 5) # Optimization of lapply(,"+"), #2212 DT = data.table(a=rep(1:3,each=2L),b=1:6,c=7:12) ans = data.table(a=rep(1:3,each=2L),b=INT(2,3,5,6,8,9),c=INT(8,9,11,12,14,15)) test(821, DT[,lapply(.SD, "+", a), by=a], ans) test(822, DT[,lapply(.SD, `+`, a), by=a], ans) ans = data.table(a=1:3,b=INT(4,9,14),c=INT(16,21,26)) test(823, DT[,lapply(.SD, "sum", a), by=a], ans) test(824, DT[,lapply(.SD, sum, a), by=a], ans) test(825, DT[,lapply(.SD, `sum`, a), by=a], ans) DT[2,b:=NA_integer_] test(825.1, DT[,lapply(.SD, function(x)sum(x)), by=a], data.table(a=1:3,b=INT(NA,7,11),c=INT(15,19,23))) test(825.2, DT[,lapply(.SD,function(x,...)sum(x,...),na.rm=TRUE),by=a], data.table(a=1:3,b=INT(1,7,11),c=INT(15,19,23))) test(825.3, DT[,lapply(.SD,sum,na.rm=TRUE),by=a], data.table(a=1:3,b=INT(1,7,11),c=INT(15,19,23))) # Test illegal names in merge are ok and setcolorder length error, #2193i and #2090 DT1 = data.table(a=letters[1:5], "Illegal(name%)"=1:5, key="a") DT2 = data.table(a=letters[1:5], b=6L, key="a") test(826, merge(DT1,DT2), cbind(DT1,b=6L)) test(827, merge(DT2,DT1), cbind(DT2,"Illegal(name%)"=1:5)) a=data.table('User ID'=c(1,2,3), 'Blah Blah'=c(1,2,3), key='User ID') #2090's test b=data.table('User ID'=c(1,2,3), 'Yadda Yadda'=c(1,2,3), key='User ID') test(827.1, names(a[b]), c("User ID","Blah Blah","Yadda Yadda")) # setcolorder and merge check for dup column names, #2193(ii) setnames(DT2,"b","a") test(828, setcolorder(DT2,c("a","b")), error="x has some duplicated column name(s): a. Please remove or rename") test(829, merge(DT1,DT2), error="y has some duplicated column name(s): a. Please remove or rename") test(830, merge(DT2,DT1), error="x has some duplicated column name(s): a. Please remove or rename") # attribs such as "comments" should be retained, #2270 DT1 <- data.table(id = seq.int(1, 10), A = LETTERS[1:10], key = "id") comment(DT1$A) <- "first comment" # copies, setattr would be better as on next line DT2 <- data.table(id = seq.int(2, 10, 2), b = letters[1:5], key = "id") setattr(DT2$b,"comment","second comment") test(831, comment(DT1[DT2]$A), "first comment") test(832, comment(DT2[DT1]$b), "second comment") test(833, sapply(merge(DT1,DT2),comment), list(id=NULL, A="first comment", b="second comment")) test(834, comment(DT1[2:3]$A), "first comment") # Test that matrix RHS of := is caught, #2333 DT = data.table(a=1:3) DT[,a:=scale(a)] # 1 column matrix auto treated as vector test(835, na.omit(DT), DT) test(836, DT[,a:=as.integer(a)], data.table(a=INT(-1,0,1))) test(837, DT[,a:=cbind(1,2)], data.table(a=c(1L,2L,1L)), warning="2 column matrix RHS of := will be treated as one vector") DT = data.table(a=1:3,b=1:6) test(838, DT[,c:=scale(b), by=a][,c:=as.integer(1000*c)], data.table(a=1:3,b=1:6,c=rep(as.integer(1000*scale(1:2)), each=3))) # Test data.table's last(). (last is used internally in data.table, too). test(839, last(1:10), 10L) # If xts is loaded, this'll just test xts's last. Ok as they're consistent, for vectors. DT = data.table(a=1:3,b=4:6) test(840, last(DT), DT[3L]) # xts's last returns a one row data.table ok. So this test is ok too, whether or not xts is loaded. # But not true when DT is a one column data.table/data.frame, see below. if ("package:xts" %in% search()) { # e.g. when run via R CMD check x = xts(1:100, Sys.Date()+1:100) test(841, last(x,10), x[91:100,]) # The important thing this tests is that data.table's last() dispatches to xts's method when data.table is loaded above xts. # But that isn't tested by R CMD check because xts is loaded above data.table, there. # So to make this test relevant, in a fresh R session type: "require(xts);require(data.table);test.data.table()" # rather than: "require(data.table);require(xts);test.data.table()" # Which was the main thrust of bug#2312 fixed in v1.8.3 } else { cat("Test 841 not run. If required call library(xts) first.\n") # So these won't run from R CMD check (deliberately, for now) ... test(842, last(list("a",1:2,89)), 89) # xts's last returns a one item list here. Would prefer it to return the item itself. DT = data.table(a=1:3) test(842.1, last(DT), DT[3L]) # xts's last returns a 3L atomic here for 1 column data.frame, strangely. We wish for the last row, consistently. I tried # providing a last.data.table method and using Enhances and Imports in DESCRIPTION with import() and S3method() in # NAMESPACE but nothing I tried made last.data.table available to xts's last if xts was loaded above data.table (which was # frustrating to test as well, see comment to test 839 above). } # Test L[[1L]][,:=] updates by reference, #2204 l = list(data.table(a=1:3), data.table(b=4:6)) test(843, l[[2L]][,c:=7:9], data.table(b=4:6,c=7:9)) test(844, l, list(data.table(a=1:3), data.table(b=4:6,c=7:9))) names(l) = c("foo","bar") # R >= 3.1 no longer copies all the contents, yay test(845, l[["foo"]][2,d:=4], data.table(a=1:3,d=c(NA,4L,NA)), warning= if (!.R.assignNamesCopiesAll) NULL else "Invalid .internal.selfref detected and fixed") l = list(data.table(a=1:3), data.table(b=4:6)) setattr(l,"names",c("foo","bar")) test(846, l[["foo"]][2,d:=4], data.table(a=1:3,d=c(NA,4,NA))) test(847, l, list(foo=data.table(a=1:3,d=c(NA,4,NA)), bar=data.table(b=4:6))) old = getOption("datatable.alloccol") options(datatable.alloccol=2L) # the return value here seems to be TRUE rather than the old expression TO DO: follow up with r-devel l = list(foo=data.table(a=1:3,b=4:6),bar=data.table(c=7:9,d=10:12)) # list() doesn't copy the NAMED==0 objects here test(848, truelength(l[[1L]]), 2L) test(849, {l[[1L]][,e:=13:15]; l[[1L]]}, data.table(a=1:3,b=4:6)[,e:=13:15]) test(850, truelength(l[[1L]]), 102L) test(851, truelength(l[[2L]]), 2L) l[["bar"]][,f:=16:18] test(852, truelength(l[[2L]]), 102L) options(datatable.alloccol=old) # Now create the list from named objected DT1 = data.table(a=1:3, b=4:6) DT2 = data.table(c=7:9) l = list(DT1, DT2) if (!.R.listCopiesNamed) { # From R>=3.1, list() no longer copies NAMED inputs (a very welcome change in Rdevel, r63767) test(853, address(DT1) == address(l[[1L]])) w = NULL } else { test(853, address(DT1) != address(l[[1L]])) w = "Invalid .internal.selfref detected and fixed.*R's list() used to copy named objects" } test(854, l[[1]][,d:=10:12], data.table(a=1:3,b=4:6,d=10:12), warning = w) test(855, l[[1]], data.table(a=1:3,b=4:6,d=10:12)) # Test setnames on data.frame, #2273. DF = data.frame(foo=1:2,bar=3:4) setnames(DF,c("baz","qux")) test(856, DF, data.frame(baz=1:2,qux=3:4)) test(857.1, set(DF,NULL,"quux",5:6), error="set() on a data.frame is for changing existing columns, not adding new ones") test(857.2, set(DF,NULL,3L,5:6), error="set() on a data.frame is for changing existing columns, not adding new ones") test(858.1, set(DF,NULL,"qux",5:6), data.frame(baz=1:2, qux=5:6)) test(858.2, set(DF,NULL,2L,7:8), data.frame(baz=1:2, qux=7:8)) # Test DT[J(data.frame())], #2265 DT = data.table(foo=c(1,2,3), bar=c(1.1,2.2,3.3), key="foo") i = data.frame(foo=1) test(859, DT[i], DT[J(i)]) test(860, DT[i], DT[data.table(i)]) # test no memory leak, #2191 and #2284 # These take a few seconds each, and it's important to run these on CRAN to check no leak gc(); before = gc()["Vcells","(Mb)"] for (i in 1:2000) { DT = data.table(1:3); rm(DT) } # in 1.8.2 would leak 3MB gc(); after = gc()["Vcells","(Mb)"] test(861, after < before+0.5) # close to 0.0 difference, but 0.5 for safe margin gc(); before = gc()["Vcells","(Mb)"] DF = data.frame(x=1:20, y=runif(20)) for (i in 1:2000) { DT = as.data.table(DF); rm(DT) } gc(); after = gc()["Vcells","(Mb)"] test(862, after < before+0.5) gc(); before = gc()["Vcells","(Mb)"] DT = data.table(x=1:20, y=runif(20)) for (i in 1:2000) { x <- DT[1:5,]; rm(x) } gc(); after = gc()["Vcells","(Mb)"] test(863, after < before+0.5) # rbindlist should look for the first non-empty data.table test(864, rbindlist(list(data.table(foo=logical(0),bar=logical(0)), DT<-data.table(baz=letters[1:3],qux=4:6))), DT) # Steve's find that setnames failed for numeric 'old' when pointing to duplicated names DT = data.table(a=1:3,b=1:3,v=1:6,w=1:6) test(865, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by="a,b",verbose=TRUE], output="result of j is a named list. It's very inefficient.*removed and put back") test(866, names(ans1), c("a","b","name1","name2")) test(867, names(ans2<-DT[,list(name1=sum(v),name2=sum(w)),by="a,b"]), c("a","b","name1","name2")) # list names extracted here test(868, ans1, ans2) # and related to setnames, too DT = data.table(a=1:3,b=1:6,key="a") test(869, DT[J(2,42,84),print(.SD)], output=" b1: 22: 5.*Empty data.table (0 rows) of 3 cols: a,V2,V3") # Test setnames with duplicate colnames DT = data.table(a=1:3,b=4:6,b=7:9) test(870, setnames(DT,"b","foo"), error="Some items of 'old' are duplicated (ambiguous) in column names: b") test(871, setnames(DT,c("bar","bar"),c("x","y")), error="Some duplicates exist in 'old': bar") test(872, setnames(DT,3,"c"), data.table(a=1:3,b=4:6,c=7:9)) test(873, setnames(DT,"foo","bar"), error="Items of 'old' not found in column names: foo") test(874, setnames(DT,c(1,1),c("foo","bar")), error="Some duplicates exist in 'old': 1") test(875, setnames(DT,"c","b"), data.table(a=1:3,b=4:6,b=7:9)) test(875.1, setnames(DT,"a","c"), data.table(c=1:3,b=4:6,b=7:9)) # 'a' isn't duplicated so not a problem as from v1.8.11 test(875.2, setnames(DT,c("c","b"), c("C","B")), error="Some items of 'old' are duplicated (ambiguous) in column names: b") # check error msg when 2nd one in old is the problem # Test local var problem introduced in v1.8.3 DT = data.table(a=1:3,b=1:6) f = function() { localvar = 2 print(DT[a>localvar]) print(DT[a>localvar,sum(b)]) print(DT[a>localvar,sum(b),by=a]) # bug fix 2368 } test(876, f(), output=" a b1: 3 32: 3 6.*[1] 9.* a V11: 3 9") # segfault when assigning NA names, #2393 DT = data.table(a=1:3, b=4:6) test(877, setnames(DT, c(NA, NA)), error="Passed a vector of type 'logical'. Needs to be type 'character'") # test no warning when use.names explicitly set, #2385 test(878, rbind(data.table(a=1:3,b=4:6), data.table(b=7:9,a=4:6)), data.table(a=1:6,b=4:9), warning="Argument 2 has names in a different order.*bound by name") test(879, rbind(data.table(a=1:3,b=4:6), data.table(b=7:9,a=4:6), use.names=TRUE), data.table(a=1:6,b=4:9)) # Test fread() n=110 # 110 just to be over the 100 limit for printing head, as a convenience DT = data.table( a=sample(1:1000,n,replace=TRUE), b=sample(1:1000,n,replace=TRUE)-500L, c=rnorm(n), d=sample(c("foo","bar","baz","qux","quux"),n,replace=TRUE), e=rnorm(n), f=sample(1:1000,n,replace=TRUE) ) DT[2,b:=NA_integer_] DT[4,c:=NA_real_] DT[3,d:=NA_character_] DT[5,d:=""] DT[2,e:=+Inf] DT[3,e:=-Inf] DT[4,e:=NaN] # write.table writes NaN as NA, though, and all.equal considers NaN==NA. fread would read NaN as NaN if "NaN" was in file write.table(DT,f<-tempfile(),sep=",",row.names=FALSE,quote=FALSE) test(880, fread(f), as.data.table(read.csv(f,stringsAsFactors=FALSE))) test(881, fread(f), DT) DT[3,d:="NA"] test(882, fread(f,na.strings=NULL), DT) DT[3,d:=NA_character_] unlink(f) write.table(DT,f<-tempfile(),sep=",",row.names=FALSE,quote=TRUE) test(883, fread(f), as.data.table(read.csv(f,stringsAsFactors=FALSE))) test(884, fread(f), DT) unlink(f) # Test short files. # All the unlinks and using a new file each time are to work around apparent Windows issues it seems when writing, appending # rereading (possibly via the MapViewOfFile) the same file that has just been appended to. These apparent issues have only # showed up on winbuilder so far, so might be in combination with the D: tempdir() there; perhaps D: is on a network drive or something. cat("",file=f<-tempfile()); test(885, fread(f), error="empty"); unlink(f) test(885.1, fread(""), error="empty") test(886, fread("\n"), error="empty") test(887, fread(" \n\t \t \n \n "), error="empty") cat("A", file=f<-tempfile()); test(888, fread(f), data.table(A=logical())); unlink(f) test(889, fread("A\n"), data.table(A=logical())) cat("AB,CDE",file=f<-tempfile()); test(890, fread(f), data.table(AB=logical(),CDE=logical())); unlink(f) test(891, fread("AB,CDE\n"), data.table(AB=logical(),CDE=logical())) cat("3.14",file=f<-tempfile()); test(892, fread(f), data.table(V1=3.14)); unlink(f) cat("A,3",file=f<-tempfile()); test(893, fread(f), data.table(V1="A",V2=3L)); unlink(f) if (.Platform$OS.type=="unix") test(893.5, fread("A,B\r\n\r\n"), data.table(A=logical(),B=logical())) for (nc in c(0,1,2)) { # 0 means all cols here for (nr in c(0,1,2,3,5,10,18,19,20,21,22,28,29,30,31,32,38,39,40,41,42)) { # 30 and 40 are trigger points for auto skip for (eol in if (.Platform$OS.type=="unix") c("\n","\r\n") else "\n") { headDT = head(DT,nr)[,seq_len(if (nc==0) ncol(DT) else nc),with=FALSE] if (nr==0) for (j in seq_len(ncol(headDT))) set(headDT,j=j,value=logical()) # when read back in empty cols are the lowest type (logical) f = tempfile() cat(names(headDT),sep=",",file=f) # no \n at the end here for (i in seq_len(nr)) { cat(eol,file=f,append=TRUE) # on unix we simulate windows too. on windows \n will write \r\n (and \r\n will write \r\r\n) write.table(headDT[i],file=f,quote=FALSE,sep=",",eol="",row.names=FALSE,col.names=FALSE,append=TRUE) # loop approach is to get no \n after last line } test(894+nr/100+nc/1000, fread(f), headDT) file.copy(f,f2<-tempfile()); unlink(f) # again trying to work around apparent issue on Windows cat(eol,file=f2,append=TRUE) # now a 'normal' file ending with \n test(895+nr/100+nc/1000, fread(f2), headDT) file.copy(f2,f3<-tempfile()); unlink(f2) cat(eol,file=f3,append=TRUE) # extra \n should be ignored test(896+nr/100+nc/1000, fread(f3), headDT) unlink(f3) }}} if ("package:bit64" %in% search()) { DT = data.table( a=sample(1:1000,n,replace=TRUE), b=sample(as.integer64(2)^35 * 1:10, n, replace=TRUE), c=sample(c("foo","bar","baz"),n,replace=TRUE) ) write.table(DT,f<-tempfile(),sep=",",row.names=FALSE,quote=FALSE) test(897, class(DT$b), "integer64") test(898, fread(f), DT) unlink(f) # Test all mid read bump coercions DT[,a2:=as.integer64(a)][,a3:=as.double(a)][,a4:=gsub(" ","",format(a))] DT[,b2:=as.double(b)][,b3:=gsub(" ","",format(b))] DT[,r:=a/100][,r2:=gsub(" ","",format(r))] DT[12, a2:=as.integer64(12345678901234)] # start on row 12 to avoid first 5, middle 5 and last 5 test rows DT[13, a3:=3.14] DT[14, a4:="123A"] DT[15, b2:=1234567890123.45] DT[16, b3:="12345678901234567890A"] # A is needed otherwise read as double with loss of precision (TO DO: should detect and bump to STR) DT[17, r2:="3.14A"] write.table(DT,f<-tempfile(),sep=",",row.names=FALSE,quote=FALSE) test(899, fread(f), DT, warning="Bumped column.*to type character.*may not be lossless") unlink(f) } else { cat("Tests 897-899 not run. If required call library(bit64) first.\n") } # getwd() has been set by test.data.table() to the location of this tests.Rraw file. Test files should be in the same directory. f = "ch11b.dat" # http://www.stats.ox.ac.uk/pub/datasets/csb/ch11b.dat test(900, fread(f), as.data.table(read.table(f))) f = "1206FUT.txt" # a CRLF line ending file (DOS) test(901, DT<-fread(f), as.data.table(read.table(f,sep="\t",header=TRUE,colClasses=as.vector(sapply(DT,class))))) # Tests the coerce of column 23 to character on line 179 due to the 'A' for the first time : f = "2008head.csv" test(902, fread(f), as.data.table(read.csv(f,stringsAsFactors=FALSE)), warning="Bumped column 23 to type character.*may not be lossless") test(903, fread("A,B\n1,3,foo,5\n2,4,barbaz,6"), data.table(1:2,3:4,c("foo","barbaz"),5:6)) # invalid header (too short) ignored test(904, fread("A,B,C,D\n1,3,foo,5\n2,4,barbaz,6"), DT<-data.table(A=1:2,B=3:4,C=c("foo","barbaz"),D=5:6)) # ok test(905, fread('A,B,C,D\n1,3,foo,5\n2,4,"barbaz",6'), DT) test(906, fread('A,B,C,D\n1,3,foo,5\n2,4,"ba,r,baz",6'), DT[2,C:="ba,r,baz"]) test(907, fread('A,B,C,D\n1,3,foo,5\n2,4,"ba,\\"r,baz",6'), DT[2,C:='ba,\\"r,baz']) # \" protected ok, but \ needs taking off too (TO DO) test(908, fread("A,B,C\n1,3,\n2,4,\n"), data.table(A=1:2,B=3:4,C=NA_integer_)) test(909, fread(" Date and Time,Open,High,Low,Close,Volume 2007/01/01 22:51:00,5683,5683,5673,5673,64 2007/01/01 22:52:00,5675,5676,5674,5674,17 2007/01/01 22:53:00,5674,5674,5673,5674,42 ")$Open, c(5683L,5675L,5674L)) # , is higher than ' ' in the hierarchy of separators, so ',' is auto detected here. # blanks when testing if header row is all character test(910, fread(" 02-FEB-2009,09:55:04:962,26022009,2500,PE,36,500,44,200,11850,1100,,2865.60 02-FEB-2009,09:55:04:987,26022009,2800,PE,108.75,200,111,50,11700,1450,,2865.60 02-FEB-2009,09:55:04:939,26022009,3100,CE,31.1,3000,36.55,200,3500,5250,,2865.60 ")$V13, rep(2865.60,3)) test(911, fread("02-FEB-2009,09:55:04:962,26022009,2500,PE,36,500,44,200,11850,1100,,2865.60 02-FEB-2009,09:55:04:987,26022009,2800,PE,108.75,200,111,50,11700,1450,,2865.60 02-FEB-2009,09:55:04:939,26022009,3100,CE,31.1,3000,36.55,200,3500,5250,,2865.60")$V13, rep(2865.60,3)) # Check manually setting separator txt = "A;B;C|D,E\n1;3;4|5,6\n2;4;6|8,10\n" test(912, names(fread(txt)), c("A;B;C|D","E")) test(913, fread(txt,sep=";"), data.table(A=1:2,B=3:4,"C|D,E"=c("4|5,6","6|8,10"))) test(914, fread(txt,sep="*"), error="The supplied 'sep' was not found on line 3") test(915, fread(txt,sep="\n"), data.table("A;B;C|D,E"=c("1;3;4|5,6","2;4;6|8,10"))) # like a fast readLines # Crash bug when RHS is 0 length and := by group, fixed in 1.8.7 DT = data.table(a=1:3,b=1:6) test(916, DT[,newcol:=logical(0),by=a], data.table(a=1:3,b=1:6,newcol=NA)) # roll join error when non last join column is factor, #2450 X = data.table(id=2001:2004, uid=c(1001,1002,1001,1001), state=factor(c('CA','CA','CA','MA')), ts=c(51,52,53,54), key='state,uid,ts') Y = data.table(id=3001:3004, uid=c(1001,1003,1002,1001), state=factor(c('CA','CA','CA','CA')), ts=c(51,57,59,59), key='state,uid,ts') test(917, X[Y,roll=TRUE], data.table(state=factor('CA'), uid=c(1001,1001,1002,1003), ts=c(51,59,59,57), id=INT(2001,2003,2002,NA), id.1=INT(3001,3004,3003,3002),key='state,uid,ts')) # NA in join column of type double, #2453. X = data.table(name=c("Joh","Raf","Jon","Ste","Rob","Smi"),depID=c(NA,31,33,33,34,34),key="depID") Y = data.table(depID=c(31,33,34,35),depName=c("Sal","Eng","Cle","Mar"),key="depID") test(918, Y[X], data.table(depID=c(NA,31,33,33,34,34),depName=c(NA,"Sal","Eng","Eng","Cle","Cle"),name=c("Joh","Raf","Jon","Ste","Rob","Smi"),key='depID')) # Y[X] same as merge.data.frame(X,Y,all.x=TRUE) test(919, X[Y], data.table(depID=c(31,33,33,34,34,35),name=c("Raf","Jon","Ste","Rob","Smi",NA),depName=c("Sal","Eng","Eng","Cle","Cle","Mar"),key='depID')) test(920, X[Y,nomatch=0], data.table(depID=c(31,33,33,34,34),name=c("Raf","Jon","Ste","Rob","Smi"),depName=c("Sal","Eng","Eng","Cle","Cle"),key='depID')) test(921, Y[X,nomatch=0], data.table(depID=c(31,33,33,34,34),depName=c("Sal","Eng","Eng","Cle","Cle"),name=c("Raf","Jon","Ste","Rob","Smi"),key='depID')) # setnames bug on keyed table, when full vector is given and target key isn't the positions in columns 1:length(key) DT = data.table(a=1:2,b=3:4,c=5:6,key="b") test(922, setnames(DT,c("A","B","C")), data.table(A=1:2,B=3:4,C=5:6,key="B")) # vecseq overflow, crash bug #2464 DT = data.table(x=rep(1L,50000),key="x") test(923, DT[DT], error="Join results in more than 2^31 rows (internal vecseq reached physical limit). Very likely misspecified join.") X = data.table(x=1:2,y=1:6,key="x") test(924, X[J(c(1,1,1))], error="Join results in 9 rows; more than 6 = max(nrow(x),nrow(i)). Check for duplicate key values in i, each of") # sorting of 'double' columns not correct for ties (tolerance nuance in C code), #2484 DT = data.table(X=as.POSIXct( c(rep("15DEC2008:00:00:00",10),"15DEC2008:00:00:00",rep("17DEC2008:00:00:00",2)),format="%d%b%Y:%H:%M:%S"),Y=c(1534,61,74,518,519,1519,1520,1524,3127,29250,30609,43,7853)) setkey(DT,X,Y) test(925, DT[,base::order(X,Y)], 1:nrow(DT)) # Test new dogroup warning for zero length columns in result when other columns are >1, #2478 DT = data.table(a=1:3,b=1:6) test(926, DT[, if(a==2L) list(42:43,NULL) else list(42L,3.14), by=a], data.table(a=INT(1,2,2,3),V1=INT(42,42,43,42),V2=c(3.14,NA,NA,3.14)), warning="Item 2 of j's result for group 2 is zero length. This will be filled with 2 NAs to match the") test(927, DT[, if(a==2L) list(42:43,numeric()) else list(42L,3.14), by=a], data.table(a=INT(1,2,2,3),V1=INT(42,42,43,42),V2=c(3.14,NA,NA,3.14)), warning="Item 2 of j's result for group 2 is zero length. This will be filled with 2 NAs to match the") # And the root cause of #2478: that cbind(DT,1:3) created invalid data.table with empty column test(928, cbind(data.table(a=1L),b=1:3), data.table(a=1L,b=1:3)) # FR #4813 implementation resulted in changing 929 error to warning # test(929, cbind(data.table(a=1L,b=2:3),c=1:3), error="argument 1 (nrow 2) cannot be recycled without remainder to match longest nrow (3)") test(929, cbind(data.table(a=1L,b=2:3),c=1:3), data.table(a=1L, b=c(2L,3L,2L), c=1:3), warning="Item 1 is of size 2 but maximum size is 3") test(930, cbind(data.table(a=1L,b=2:3),c=1:4), data.table(a=1L,b=INT(2,3,2,3),c=1:4)) DT = data.table(x=c(1,1,1,1,2,2,3),y=c(1,1,2,3,1,1,2)) DT[,rep:=1L][c(2,7),rep:=c(2L,3L)] # duplicate row 2 and triple row 7 DT[,num:=1:.N] # to group each row by itself test(931, DT[,cbind(.SD,dup=1:rep),by="num"], data.table(num=INT(1,2,2,3:7,7,7),x=c(1,1,1,1,1,2,2,3,3,3),y=c(1,1,1,2,3,1,1,2,2,2),rep=INT(1,2,2,1,1,1,1,3,3,3), dup=INT(1,1,2,1,1,1,1,1,2,3))) # New roll=+/- and rollends DT = data.table(a=INT(1,3,4,4,4,4,7), b=INT(5,5,6,6,9,9,2), v=1:7, key="a,b") test(932, DT[J(c(0,2,6,8)), roll=+Inf, rollends=TRUE, v]$v, INT(1,1,6,7)) test(933, DT[J(c(0,2,6,8)), roll=-Inf, rollends=TRUE, v]$v, INT(1,2,7,7)) test(934, DT[J(c(0,2,6,8)), roll=+Inf, v]$v, INT(NA,1,6,7)) test(935, DT[J(c(0,2,6,8)), roll=-Inf, v]$v, INT(1,2,7,NA)) test(936, DT[J(c(-10,-1,2,12,13)), roll=5, rollends=TRUE, v]$v, INT(NA,1,1,7,NA)) test(937, DT[J(c(-10,-1,2,12,13)), roll=-5, rollends=TRUE, v]$v, INT(NA,1,2,7,NA)) test(938, DT[J(c(-10,2,6,7,8)), roll="nearest", v]$v, INT(1,1,7,7,7)) test(939, DT[J(c(-10,2,6,7,8)), roll="nearest", rollends=c(TRUE,FALSE), v]$v, INT(1,1,7,7,NA)) test(940, DT[J(c(-10,2,6,7,8)), roll="nearest", rollends=c(FALSE,TRUE), v]$v, INT(NA,1,7,7,7)) test(941, DT[J(c(-10,2,6,7,8)), roll="nearest", rollends=FALSE, v]$v, INT(NA,1,7,7,NA)) # merge all=TRUE with space in a y column name, #2555 X = data.table(a=1:3,b=4:6) Y = data.table(a=2:4,"d 1"=5:7) # space in Y's column name test(942, merge(X,Y,all=TRUE,by="a"), data.table(a=1:4,b=INT(4:6,NA),"d 1"=INT(NA,5:7),key="a")) test(943, merge(X,Y,all.y=TRUE,by="a"), data.table(a=2:4,b=INT(5:6,NA),"d 1"=5:7,key="a")) # Test error message say NULL rather than empty table DT = data.table(NULL) test(944, DT[,a:=1L], error = "Cannot use := to add columns to a null data.table.*You can use") DT = data.table(a=numeric()) test(945, DT[,b:=a+1], data.table(a=numeric(),b=numeric())) # fread blank column names get default names test(946, fread('A,B,,D\n1,3,foo,5\n2,4,bar,6\n'), data.table(A=1:2,B=3:4,c("foo","bar"),D=5:6)) test(947, fread('0,2,,4\n1,3,foo,5\n2,4,bar,6\n'), data.table(0:2,2:4,c("","foo","bar"),4:6)) test(948, fread('A,B,C\nD,E,F\n',header=TRUE), data.table(A="D",B="E",C=FALSE)) test(949, fread('A,B,\nD,E,F\n',header=TRUE), data.table(A="D",B="E",V3=FALSE)) # +/- with no numbers afterwards should read as character test(950, fread('A,B,C\n1,+,4\n2,-,5\n3,-,6\n'), data.table(A=1:3,B=c("+","-","-"),C=4:6)) # catching misuse of `:=` x = data.table(a=1:5) test(951, x[,{b=a+3; `:=`(c=b)}], error="defined for use in j, once only and in particular ways") # fread colClasses input = 'A,B,C\n01,foo,3.140\n002,bar,6.28000\n' test(952, fread(input, colClasses=c(C="character")), data.table(A=1:2,B=c("foo","bar"),C=c("3.140","6.28000"))) test(953, fread(input, colClasses=c(C="character",A="numeric")), data.table(A=c(1.0,2.0),B=c("foo","bar"),C=c("3.140","6.28000"))) test(954, fread(input, colClasses=c(C="character",A="double")), data.table(A=c(1.0,2.0),B=c("foo","bar"),C=c("3.140","6.28000"))) test(955, fread(input, colClasses=list(character="C",double="A")), data.table(A=c(1.0,2.0),B=c("foo","bar"),C=c("3.140","6.28000"))) test(956, fread(input, colClasses=list(character=2:3,double="A")), data.table(A=c(1.0,2.0),B=c("foo","bar"),C=c("3.140","6.28000"))) test(957, fread(input, colClasses=list(character=1:3)), data.table(A=c("01","002"),B=c("foo","bar"),C=c("3.140","6.28000"))) test(958, fread(input, colClasses="character"), data.table(A=c("01","002"),B=c("foo","bar"),C=c("3.140","6.28000"))) test(959, fread(input, colClasses=c("character","double","numeric")), data.table(A=c("01","002"),B=c("foo","bar"),C=c(3.14,6.28))) test(960, fread(input, colClasses=c("character","double")), error="colClasses is unnamed and length 2 but there are 3 columns. See") test(961, fread(input, colClasses=1:3), error="colClasses is not type list or character vector") test(962, fread(input, colClasses=list(1:3)), error="colClasses is type list but has no names") test(963, fread(input, colClasses=list(character="D")), error="Column name 'D' in colClasses..1.. not found") test(964, fread(input, colClasses=c(D="character")), error="Column name 'D' in colClasses..1.. not found") test(965, fread(input, colClasses=list(character=0)), error="Column number 0 (colClasses..1...1.) is out of range .1,ncol=3.") test(966, fread(input, colClasses=list(character=2:4)), error="Column number 4 (colClasses..1...3.) is out of range .1,ncol=3.") # Character input more than 4096 bytes (used to be passed through path.expand which imposed the limit), #2649 test(967, nrow(fread( paste( rep('a\tb\n', 10000), collapse=''), header=FALSE)), 10000L) # Test fread warns about removal of any footer (and autostart skips up over it) test(968, fread("A,B\n1,3\n2,4\n\nRowcount: 2\n"), data.table(A=1:2,B=3:4), warning="Stopped reading at empty line 4.*discarded.*Rowcount: 2") test(969, fread("A,B\n1,3\n2,4\n\n\nRowcount: 2"), data.table(A=1:2,B=3:4), warning="Stopped reading at empty line 4.*discarded.*Rowcount: 2") test(970, fread("A,B\n1,3\n2,4\n\n\nRowcount: 2\n\n"), data.table(A=1:2,B=3:4), warning="Stopped reading at empty line 4.*discarded.*Rowcount: 2") # fread skip override input = "some,bad,data\nA,B,C\n1,3,5\n2,4,6\n" test(971, fread(input), data.table(some=c("A",1:2),bad=c("B",3:4),data=c("C",5:6))) test(972, fread(input, skip=1), data.table(A=1:2,B=3:4,C=5:6)) test(973, fread(input, skip=2), data.table(V1=1:2,V2=3:4,V3=5:6)) test(974, fread(input, skip=2, header=TRUE), data.table("1"=2L,"3"=4L,"5"=6L)) test(975, fread(input, skip="B"), data.table(A=1:2,B=3:4,C=5:6)) input = "\n\nA,B\n1,3\n2,4\n\nC,D\n5,7\n6,8\n\nE,F\n9,11\n10,12\n" # 3 tables in one file test(976, fread(input), data.table(E=9:10,F=11:12)) # autostart 30 finds the last one test(977, fread(input, autostart=8), data.table(C=5:6,D=7:8), warning="Stopped reading at empty line 10.*but text exists afterwards") test(978, fread(input, skip="D"), data.table(C=5:6,D=7:8), warning="Stopped reading at empty line, 2 lines after.*but text exists afterward") # mixed add and update in same `:=` bug/crash, #2528 and #2778 DT = data.table(x=rep(1:2, c(3,2)), y=6:10) DT[, z:=.GRP, by=x] # first assignment test(979, DT[, `:=`(z=.GRP, w=2), by=x], data.table(x=INT(1,1,1,2,2),y=6:10,z=INT(1,1,1,2,2),w=2)) # mixed update and add # and example from http://stackoverflow.com/a/14732348/403310 : dt1 = fread("Date,Time,A,B 01/01/2013,08:00,10,30 01/01/2013,08:30,15,25 01/01/2013,09:00,20,20 02/01/2013,08:00,25,15 02/01/2013,08:30,30,10 02/01/2013,09:00,35,5") dt2 = fread("Date,A,B,C 01/01/2013,100,300,1 02/01/2013,200,400,2") setkey(dt1, "Date") setkey(dt2, "Date") test(980, dt1[dt2, `:=`(A=A+i.A, B=B+i.B, C=i.C)][,list(A,B,C)], data.table(A=INT(110,115,120,225,230,235),B=INT(330,325,320,415,410,405),C=rep(1:2,each=3))) DT = data.table(A=1:2,B=3:4,C=5:6) test(981, DT[,`:=`(D=B+4L,B=0:1,E=A*2L,F=A*3L,C=C+1L,G=C*2L),by=A], data.table(A=1:2,B=0L,C=6:7,D=7:8,E=c(2L,4L),F=c(3L,6L),G=c(10L,12L)), warning="RHS 2 is length 2") DT = data.table(A=1:2,B=3:4,C=5:6) test(982, DT[,`:=`(D=B+4L,B=0L,E=A*2L,F=A*3L,C=C+1L,G=C*2L),by=A], data.table(A=1:2,B=0L,C=6:7,D=7:8,E=c(2L,4L),F=c(3L,6L),G=c(10L,12L))) # Also note that G is not yet iterative. In future: c(12,14) # rbindlist binding factors, #2650 test(983, rbindlist(list(data.table(factor(c("A","A","B","C","A"))), data.table(factor(c("B","F","A","G"))))), data.table(V1=factor(c("A","A","B","C","A","B","F","A","G")))) test(984, rbindlist(list(data.table(factor(c("A","B"))), data.table(c("C","A")))), data.table(factor(c("A","B","C","A")))) test(985, rbindlist(list(data.table(c("A","B")), data.table(factor(c("C","A"))))), data.table(factor(c("A","B","C","A")))) # with NA test(985.1, rbindlist(list(data.table(factor(c("A","B"))), data.table(factor(c("C",NA))))), data.table(factor(c("A","B","C",NA)))) test(985.2, rbindlist(list(data.table(c("A","B")), data.table(factor(c("C",NA))))), data.table(factor(c("A","B","C",NA)))) ## Allow unique/duplicated to accept custom colum combination to query for ## uniqueness dt <- data.table(A = rep(1:3, each=4), B = rep(11:14, each=3), C = rep(21:22, 6), key = "A,B") df <- as.data.frame(dt) test(986, unique(dt), dt[!duplicated(df[, key(dt)]),]) test(987, unique(dt, by='A'), dt[!duplicated(df[, 'A'])]) test(988, unique(dt, by='B'), dt[!duplicated(df[, 'B'])]) test(989, unique(dt, by='C'), dt[!duplicated(df[, 'C'])]) test(990, unique(dt, by=c('B', 'C')), dt[!duplicated(df[, c('B', 'C')])]) test(991, unique(dt, by=NULL), dt[!duplicated(df)]) test(991.1, unique(dt, by=4), error="Integer values between 1 and ncol are required") test(991.2, unique(dt, by=c(1,3.1)), error="Integer values between 1 and ncol are required") test(991.3, unique(dt, by=2:3), dt[!duplicated(df[,c('B','C')])]) test(991.4, unique(dt, by=c('C','D','E')), error="by specifies column names that do not exist. First 5: D,E") # :=NULL on factor column in empty data.table, #4809 DT = data.table(A = integer(), B = factor()) test(992, DT[, B:=NULL], data.table(A=integer())) # That including FUN= works in j=lapply, #4839 DT = as.data.table(iris) test(993, DT[, lapply(.SD, function(x) sum(!is.na(x), na.rm=TRUE)), by = Species], DT[, lapply(.SD, FUN=function(x) sum(!is.na(x), na.rm=TRUE)), by = Species]) # fread more than 50,000 columns, the R_PPSSIZE limit in Defn.h # Takes too long for routine use. TO DO: move to a long running stress test script #M = matrix(1,nrow=3,ncol=200000) #f = tempfile() #write.csv(M,f,row.names=FALSE) #test(994, fread(f)[[200000]], rep(1L,3)) #unlink(f) # CJ with `sorted = FALSE` option DT <- data.table(x=rep(3:5, each=4), y=rep(1:6, each=2), z=1:12) setkey(DT, x, y) OUT <- DT[J(c(5,5,3,3), c(5,1,5,1))] test(995, DT[CJ(c(5,3), c(5,1), sorted=FALSE)], OUT) # CJ with ordered factor xx <- factor(letters[1:2], ordered=TRUE) yy <- sample(2) test(996, CJ(xx, yy), setkey(data.table(rep(xx, each=2), rep(base::sort.int(yy), 2)))) # That CJ orders NA consistently with setkey and historically, now it doesn't use setkey. # NA must always come first in data.table throughout, since binary search relies on that internally. test(997, DT <- CJ(c(1,3,NA,2), 5:6), setkey(setkey(copy(DT),NULL))) # double setkey to really rebuild key test(998, DT <- CJ(as.integer(c(1,3,NA,2)), 5:6), setkey(setkey(copy(DT),NULL))) test(999, DT <- CJ(c("A","B",NA,"C"), 5:6), setkey(setkey(copy(DT),NULL))) test(1000, DT <- CJ(c(1,NA,3), c("B",NA,"A"), c(5L,NA_integer_)), setkey(setkey(copy(DT),NULL))) test(1001, DT <- CJ(c(1,NA,3)), setkey(setkey(copy(DT),NULL))) # The 1 column case is switched inside CJ() so test that too. # merge all=TRUE when y is empty, #2633 a = data.table(P=1:2,Q=3:4,key='P') b = data.table(P=2:3,R=5:6,key='P') test(1002, merge(a,b[0],all=TRUE), data.table(merge.data.frame(a,b[0],all=TRUE),key='P')) a = data.table(c=c(1,2),key='c') b = data.table(c=3,key='c') test(1003, merge(a,b[0],all=TRUE), data.table(merge.data.frame(a,b[0],all=TRUE),key='c')) # setkey with backticks, #2452 DT = data.table("Date and Time"=1:3,x=4:6) test(1004, setkey(copy(DT),`Date and Time`), setkey(DT,"Date and Time")) # rbinding with duplicate names, NA or "", #2384 and #2726 DT = data.table(a=1:3,b=4:6,b=7:9,c=10:12) test(1005, rbind(DT,DT), data.table(a=rep(1:3,2),b=rep(4:6,2),b=rep(7:9,2),c=rep(10:12,2))) M <- mtcars colnames(M)[11] <- NA test(1006, print(as.data.table(M), nrows=10), output="gear NA.*1: 21.0") # rbinding factor with non-factor/character DT1 <- data.table(x=1:5, y=factor("a")) DT2 <- data.table(x=1:5, y=2) test(1007, rbindlist(list(DT1, DT2)), data.table(x = c(1:5, 1:5), y = factor(c(rep('a', 5), rep('2', 5)), levels = c('a', '2')))) test(1008, rbindlist(list(DT2, DT1)), data.table(x = c(1:5, 1:5), y = factor(c(rep('2', 5), rep('a', 5))))) # rbindlist different types DT1 <- data.table(a = 1L, b = 2L) DT2 <- data.table(a = 2L, b = 'a') DT3 <- data.table(a = 2L, b = 2.5) test(1008.1, rbindlist(list(DT1, DT2)), data.table(a = c(1L,2L), b = c('2', 'a'))) test(1008.2, rbindlist(list(DT1, DT3)), data.table(a = c(1L,2L), b = c(2, 2.5))) # optimized mean() respects na.rm=TRUE by default, as intended DT = data.table(a=c(NA,NA,FALSE,FALSE), b=c(1,1,2,2)) test(1009, DT[,list(mean(a), sum(a)),by=b], data.table(b=c(1,2),V1=c(NA,0),V2=c(NA,0))) # an fread error shouldn't hold a lock on the file on Windows. f = tempfile() cat('A,B\n"aa",2\n"bb,2\n"cc",3\n', file=f) test(1010, fread(f), error="A field starting with quote.*doesn't end with a quote on this line.*bb,2") cat('"dd",3\n',file=f,append=TRUE) # testing file lock after error test(1011, fread(f), error="A field starting with quote.*doesn't end with a quote on this line.*bb,2") cat('A,B\n"aa",1\n"bb",2\n"cc",3\n', file=f) # testing overwrite test(1012, fread(f), data.table(A=c("aa","bb","cc"),B=1:3)) unlink(f) # testing file can be removed after error # integer64 control to fread test(1013, fread("A,B\n123,123\n", integer64="integer"), error="integer64='%s' which isn't 'integer64'|'double'|'numeric'|'character'") test(1014, fread("A,B\n123456789123456,21\n", integer64="character"), data.table(A="123456789123456",B=21L)) test(1015, fread("A,B\n123456789123456,21\n", integer64="double"), data.table(A=as.double("123456789123456"),B=21L)) # and that mid read bumps respect integer64 control too .. x = sample(1:1000,100,replace=TRUE) DT = data.table( A=as.character(x), B=1:100) DT[15, A:="123456789123456"] # row 15 is outside the top, middle and last 5 rows. write.table(DT,f<-tempfile(),sep=",",row.names=FALSE,quote=FALSE) test(1016, fread(f,integer64="numeric"), copy(DT)[,A:=as.numeric(A)]) test(1017, fread(f,integer64="character"), DT, warning="Bumped column.*to type character.*may not be lossless") unlink(f) # ERANGE warning, #4879 test(1018, fread("1.46761e-313\n"), data.table(V1=as.numeric("1.46761e-313")), warning="strtod() returned ERANGE") test(1019, fread("1.23456789123456789123456789\n"), data.table(V1=as.numeric("1.23456789123456789123456789"))) # no warning, as standard # crash assigning to row 0, #2754 DT = data.table(A=1:5,B=6:10) test(1020, DT[0,A:=6L], error="i[[]1[]] is 0 which is out of range [[]1,nrow=5[]]") test(1021, DT[NA,A:="foo"], error="i[[]1[]] is NA. Can't assign by reference to row 'NA'") test(1022, DT[5:0,A:=21L], error="i[[]6[]] is 0 which is out of range [[]1,nrow=5[]]") test(1023, DT[c(1,2,NA,3), B:="bar"], error="i[[]3[]] is NA") test(1024, DT[6,A:=0L], error="i[[]1[]] is NA. Can't") # TO DO: earlier point at R level is replacing 6 with NA? # crash assigning to duplicated column names/numbers, #2751 test(1024.1, DT[,c("B","B"):=NULL], error="Can't assign to the same column twice in the same query (duplicates detected).") test(1024.2, DT[,c(1,2,1):=NULL], error="Can't assign to the same column twice in the same query (duplicates detected).") # as.data.table.table, #4848 DF <- data.frame(x = c(1,1,2,NA,1,2), y = c("b", "b", "b", "a", "c", "a"), z = c(1,1,1,1,1,2), stringsAsFactors=FALSE ) tab1 <- as.data.table(as.data.frame(table(DF$x), stringsAsFactors=FALSE)); setattr(tab1, 'names', c("V1", "N")) tab2 <- as.data.table(as.data.frame(table(DF$x, DF$y), stringsAsFactors=FALSE)); setattr(tab2, 'names', c("V1", "V2", "N")) tab3 <- as.data.table(as.data.frame(table(DF$x, DF$y, DF$z), stringsAsFactors=FALSE)); setattr(tab3, 'names', c("V1", "V2", "V3", "N")) test(1025, as.data.table(table(DF$x)), tab1) test(1026, as.data.table(table(DF$x, DF$y)), tab2) test(1027, as.data.table(table(DF$x, DF$y, DF$z)), tab3) # catch printing of data.table(table()), #4847 (as.data.table should be used instead) test(1027.1, print(data.table(table(1:99))), output="99: 1") # data.table() and rbindlist() in v1.8.11 now catch and removes the dim attribute. For it on to test print catches it : test(1027.2, {DT<-data.table(table(1:99));setattr(DT[[1]],"dim",99L);print(DT)}, error="Invalid column: it has dimensions. Can't format it. If it's the result of data.table(table()), use as.data.table(table()) instead.") # as.data.table.x where x is integer, numeric, etc... set.seed(45) test(1028, as.data.table(x<-sample(5)), data.table(V1=x)) test(1029, as.data.table(x<-as.numeric(x)), data.table(V1=x)) test(1030, as.data.table(x<-as.Date(x, origin="2013-01-01")), data.table(V1=x)) test(1031, as.data.table(x<-factor(sample(5))), data.table(V1=x)) test(1032, as.data.table(x<-factor(x, ordered=TRUE)), data.table(V1=x)) test(1033, as.data.table(x<-as.logical(sample(0:1, 5, TRUE))), data.table(V1=x)) test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) # melt.data.table if ("package:reshape2" %in% search()) { try(detach(package:reshape),silent=TRUE) # otherwise when reshape is loaded, can't access melt.data.table from reshape2. # try() in case reshape isn't loaded (detach gives not found error in that case) set.seed(45) DT <- data.table( i1 = c(1:5, NA), i2 = c(NA,6,7,8,9,10), f1 = factor(sample(c(letters[1:3], NA), 6, TRUE)), c1 = sample(c(letters[1:3], NA), 6, TRUE), d1 = as.Date(c(1:3,NA,4:5), origin="2013-09-01"), d2 = as.Date(6:1, origin="2012-01-01")) DT[, l1 := DT[, list(c=list(rep(i1, sample(5,1)))), by = i1]$c] # generate list cols DT[, l2 := DT[, list(c=list(rep(c1, sample(5,1)))), by = i1]$c] test(1035, melt(DT, id=1:2, measure=3:4), melt(DT, id=c("i1", "i2"), measure=c("f1", "c1"))) ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l1")) ans1[, value := DT$l1] test(1036, melt(DT, id=c("i1", "i2", "l2"), measure=c("l1")), ans1) ans2 = data.table(c1=DT$c1, variable=rep(c("d1", "d2"), each=6), value=as.numeric(c(DT$d1, DT$d2)))[!is.na(value)] test(1037, melt(DT, id=4, measure=5:6, na.rm=TRUE, variable.factor=FALSE), ans2) DT2 <- data.table(x=1:5, y=1+5i) # unimplemented class test(1038, melt(DT2, id=1), error="Unknown column type 'complex'") # TO DO: more tests } # sorting and grouping of Inf, -Inf, NA and NaN, #4684, #4815 & #4883 DT <- data.table(x = rep(c(1,NA, NaN, Inf, -Inf), each=2)) OUT <- data.table(x=c(1,NA, NaN, Inf, -Inf), N=2L) test(1039, DT[, .N, by=x], OUT) DT <- data.table(y =c(NA, Inf, NA, -Inf, -Inf, NaN, Inf, 1, NaN, 1)) OUT <- data.table(y = c(NA, Inf, -Inf, NaN, 1), N=2L) test(1040, DT[, .N, by=y], OUT) # rbindlist on *data.frame* input, #4648. Somehow not test for this. (Although, #4648 was the same as #2650 fixed in v1.8.9). l <- list(u1=data.frame(i1=c('a', 'b', 'c'), val=1:3, stringsAsFactors=TRUE), u2=data.frame(i1=c('d', 'e'), val=4:5, stringsAsFactors=TRUE)) test(1041, rbindlist(l), data.table(i1=factor(letters[1:5]),val=1:5)) # negative indexing in *i* leads to crash/wrong aggregates when dogroups is called. bug #2697 DT = data.table(x = c(1,2,3,4,5), group = c(1,1,2,2,3)) test(1042, DT[-5, mean(x), by = group], data.table(group=1:2, V1=c(1.5, 3.5))) # Test when abs(negative index) > nrow(dt) - should warn test(1042.1, DT[-10], DT, warning="do not exist to be removed") test(1042.2, DT[c(-5, -10), mean(x), by = group], data.table(group=1:2, V1=c(1.5, 3.5)), warning="do not exist to be removed") # Test #1043 TO DO - mixed negatives test(1043, DT[c(1, -5)], error="only 0's may be mixed with negative subscripts") # crash (floating point exception), when assigning null data.table() to multiple cols, #4731 DT = data.table(x=1:5,y=6:10) test(1044, DT[3,c("x","y"):=data.table()],error="Supplied 2 columns to be assigned an empty list.*use NULL instead.*list(list())") test(1045, DT[3,c("x","y"):=list()],error="Supplied 2 columns to be assigned an empty list.*use NULL instead.*list(list())") # negative indexing with head() and tail(). bug #2375 d1 = data.table(date = c(1,2,3,4,5), value = c(1,2,3,4,5)) d2 = data.frame(d1) test(1046, head(d1, -2), as.data.table(head(d2, -2))) test(1047, head(d1, 2), as.data.table(head(d2, 2))) test(1048, head(d1, -10), as.data.table(head(d2, -10))) test(1049, head(d1, 10), as.data.table(head(d2, 10))) test(1050, tail(d1, -2), as.data.table(tail(d2, -2))) test(1051, tail(d1, 2), as.data.table(tail(d2, 2))) test(1052, tail(d1, -10), as.data.table(tail(d2, -10))) test(1053, tail(d1, 10), as.data.table(tail(d2, 10))) # negative indexing with `:=` - new feature through fixing of #2697, performs as intended for negative subscripts, throws an error for 0. x <- data.table(letters=letters[1:5], number=1:5) test(1054, x[-(1:3), number := 1L], x[4:5, number := 1L]) test(1055, x[0, number := 1L], error="which is out of range") # print.data.table heeds digits=2 etc, #2535 DT = data.table(x=rep(c("a","b","c"),each=3), y=(30/7)^(2:10))[, logy := log(y)] test(1056, print(DT, digits=2), output=" x y logy1: a 18 2.92: a 79 4.43: a 337 5.8") test(1057, print(DT, digits=2, big.mark=","), output=" x y logy1: a 18 2.9.*6: b 26,556 10.27: c 113,811 11.6") # bug #2758 fix - segfault with zeros in i and factors in by x <- data.table(letters=letters[1:5], factor=factor(letters[1:5]), number=1:5) test(1058, x[c(0, 3), list(letters, number), by=factor], error="While grouping, i=0 is allowed") test(1059, x[c(3, 0), list(letters, number), by=factor], error="While grouping, i=0 is allowed") test(1060, x[c(0, 3), number := 5L, by=factor], error="While grouping, i=0 is allowed") test(1061, x[c(0, 3), number := 5L], error="which is out of range") # to make sure that `:=` throws correct error without by. # bug #2440 fix - seqfault when j refers to grouping variable when results are empty DT = data.table(x=rep(c("a","b"),each=3),v=c(42,42,42,4,5,6)) test(1062, DT[x %in% c('z'),list(x2=x),by=x], output="Empty data.table (0 rows) of 2 cols: x,x2") test(1063, DT[x %in% c('z'),list(vpaste=paste(v,collapse=','),x2=paste(x,x)),by=x], output="Empty data.table (0 rows) of 3 cols: x,vpaste,x2") test(1064, DT[integer(0), list(x2=x), by=x], output="Empty data.table (0 rows) of 2 cols: x,x2") # bug #2445 fix - := fails when subsetting yields NAs and with=FALSE X = data.table(A=1:3, B=1:6, key="A") var <- "B" test(1065, X[J(2:5), var:=22L, with=FALSE], data.table(A=rep(1:3, each=2), B=c(1L,4L,rep(22L,4)), key="A")) # fread single unnamed colClasses f = "A,B,C,D\n1,3,5,7\n2,4,6,8\n" test(1066, fread(f,colClasses=c("integer","integer","character")), error="colClasses is unnamed and length 3 but there are 4 columns") test(1067, fread(f,colClasses=c("integer","numeric","character","character")), data.table(A=1:2,B=c(3,4),C=c("5","6"),D=c("7","8"))) test(1068, fread(f,colClasses="character"), data.table(A=c("1","2"),B=c("3","4"),C=c("5","6"),D=c("7","8"))) # fread select and drop test(1069, fread(f,drop=c("D","B")), data.table(A=1:2,C=5:6)) test(1070, fread(f,drop="E"), fread(f), warning="Column name 'E' in 'drop' not found") test(1071, fread(f,select="B",colClasses=list(numeric="C")), data.table(B=3:4)) test(1072, fread(f,select="B",drop="C"), error="not both") test(1073, fread(f,drop=2:3), fread(f,select=c(1,4))) # tests coercing numeric select as well # that problem printing duplicate columns doesn't return, #4788 DT = data.table(V1 = c(1:1000), V2 = c(10001:11000)) test(1074, DT[, sum(V2), by = V1], output="1000: 1000 11000") # x has two columns both called V1 here # add test from #2446. Already fixed but add anyway. "names in neworder not found in x: 'colnames with spaces' from merge() when all.y=TRUE" X = data.table(a=1:3,b=4:6,"c d"=7:9) Y = data.table(e=10:12,a=2:4) test(1075, merge(X,Y,by="a",all=TRUE), data.table(a=c(1:4),b=c(4:6,NA),"c d"=c(7:9,NA),e=c(NA,10:12),key="a")) # Fixes #2670. `by` sometimes incorrect for expressions of keyed columns. When by is used like `by=month(date)`, with key column set to "date", grouping+aggregation would be wrong. DT = data.table(date=as.Date("2013-01-01")+seq(1,1000,by=10),1:100) setkey(DT,date) test(1076, DT[,sum(V2),by=month(date)], DT[, sum(V2), by=list(month(date))]) # just to be sure, second test with another function using sample. setkey(DT, V2) ff <- function(x) { set.seed(45); (sample(x)-1) %/% 10} test(1077, DT[, sum(V2),by=ff(V2)], DT[, sum(V2),by=list(ff(V2))]) # rbindlist should discard names on columns, #4890 d = data.frame(x=1:5) f = function(x) {suppressWarnings(DF<-data.frame(x=x, y=1:10)); setattr(DF$x,"names","a");DF} l = apply(d, 1, f) test(1078.1, length(names(l[[1]]$x)), 10) # test this test is creating names on the column test(1078.2, length(names(l[[2]]$x)), 10) a = rbindlist(l) test(1078.3, a$x, rep(1:5,each=10)) # a$x would segfault before the fix to rbindlist # data.table() shouldn't retain column names, root cause of #4890 x = 1:5 names(x) = letters[1:5] test(1079.1, DF<-data.frame(x=x, y=1:10), data.frame(x=rep(1:5,2),y=1:10), warning="row names.*discarded") test(1079.2, lapply(DF, names), list(x=NULL, y=NULL)) test(1079.3, DT<-data.table(x=x, y=1:10), data.table(x=rep(1:5,2),y=1:10)) test(1079.4, lapply(DT, names), list(x=NULL, y=NULL)) # test from similar #4912 for completeness z = c(a=1,b=2,c=3) a = data.table(z,x=1:3) b = rbind(a, data.table(z=2,x=1)) test(1080, b$z, c(1,2,3,2)) # mid row logical detection test(1081, fread("A,B,C\n1,T,2\n"), data.table(A=1L,B=TRUE,C=2L)) # cartesian join answer's key should contain only the columns considered in binary search. Fixes #2677 set.seed(45) n <- 10 DT1 <- data.table(a=sample(1:3, n, replace=TRUE), b=sample(1:3, n, replace=TRUE), c=sample(1:10, n,replace=TRUE), key=c("a", "b", "c")) DT2 <- data.table(p=sample(1:3, n, replace=TRUE), q=sample(1:3, n, replace=TRUE), r=sample(1:n), w=sample(1:n)) setkey(DT2, p,q) ans <- DT1[DT2, nomatch=0, allow.cartesian=TRUE] test(1082, ans, setkeyv(ans, key(ans))) # if key is not set properly, there should be a warning and re-keying that'll be caught. check <- setkey(as.data.table(aggregate(r ~a+b+c, ans, length)), a, b) test(1083, setkeyv(ans[, list(r = .N), by=key(DT1)], key(ans)), check) # if the key is set properly, then and only then will the aggregation results match with "check" # Tests for #2531. `:=` loses POSIXct or ITime attribute: # first test from this SO post: http://stackoverflow.com/questions/15996692/cannot-assign-columns-as-date-by-reference-in-data-table dt <- data.table(date = as.IDate(sample(10000:11000, 10), origin = "1970-01-01")) dt[, group := rep(1:2, 5)] dt[, min.group.date := as.IDate(min(date)), by = group] test(1084, class(dt$min.group.date), c("IDate", "Date")) dt <- data.table(date = as.IDate(sample(10000:11000, 10), origin = "1970-01-01")) dt[, group := rep(1:2, 5)] dt[, min.group.date := min(date), by = group] # don't need to wrap it with as.IDate(.) test(1085, class(dt$min.group.date), c("IDate", "Date")) # second test from this SO post: http://stackoverflow.com/questions/14604820/why-does-this-posixct-or-itime-loses-its-format-attribute DT = data.table(x=as.POSIXct(c("2009-02-17 17:29:23.042", "2009-02-17 17:29:25.160")), y=c(1L,2L)) DT[,x1:=as.ITime(x)] DT[,`:=`(last.x=tail(x,1L),last.x1=tail(x1,1L)),by=y] test(1086, class(DT$last.x), c("POSIXct", "POSIXt")) test(1087, class(DT$last.x1), "ITime") # chmatch on 'unknown' encoding (e.g. as.character(as.symbol("ä")) )falling back to match, #2538 and #4818 x1 <- c("alä", "ala", "äallc", "coep") x2 <- c("ala", "alä") test(1088.1, chmatch(x1, x2), match(x1, x2)) # should not fallback to "match" test(1088.2, x1 %chin% x2, x1 %in% x2) # change x1 to symbol to character x3 <- unlist(lapply(x1, function(x) as.character(as.name(x))), use.names=FALSE) test(1089.1, chmatch(x3, x2), match(x3, x2)) # should fallback to match in "x" test(1089.2, x3 %chin% x2, x3 %in% x2) # should fallback to match in "x" # change x2 to symbol to character x4 <- unlist(lapply(x2, function(x) as.character(as.name(x))), use.names=FALSE) test(1090.1, chmatch(x1,x4), match(x1, x4)) # should fallback to match in "table" test(1090.2, x1 %chin% x4, x1 %in% x4) # both are symbols to characters test(1091.1, chmatch(x3, x4), match(x3, x4)) # should fallback to "match" in "x" as well. test(1091.2, x3 %chin% x4, x3 %in% x4) # for completness, include test from #2528 of non ascii LHS of := (it could feasibly fail in future due to something other than chmatch) DT = data.table(pas = c(1:5, NA, 6:10), good = c(1:10, NA)) setnames(DT, "pas", "päs") test(1092, DT[is.na(päs), päs := 99L], data.table("päs" = c(1:5, 99L, 6:10), good = c(1:10,NA))) test(1093, DT[, päs := 34L], data.table(päs = 34L, good=c(1:10,NA))) # print of unnamed DT with >20 <= 100 rows, #4934 DT <- data.table(x=1:25, y=letters[1:25]) DT.unnamed <- unname(copy(DT)) test(1094, print(DT.unnamed), output="NA NA 1: 1 a 2: 2 b 3: 3 c") # DT[!TRUE] or DT[!TRUE, which=TRUE], #4930. !TRUE still can be a recycling operation with !(all TRUE) DT <- data.table(x=1:3, y=4:6) test(1095.1, DT[!TRUE], DT[FALSE]) test(1095.2, DT[!TRUE, which=TRUE], DT[FALSE, which=TRUE]) ######### incremented tests by 1 as I've used 1096 for FR #2077 (above along with already existing tests 522): ########### # roll backwards when i is keyed and rollends=FALSE # http://stackoverflow.com/questions/18984179/roll-data-table-with-rollends dt1 = data.table(Date=as.Date(c("2013-01-03","2013-01-07")),key="Date")[,ind:=.I] dt2 = data.table(Date=seq(from=as.Date("2013-01-01"),to=as.Date("2013-01-10"), by="1 day"),key="Date") test(1097, dt1[dt2,roll=-Inf,rollends=FALSE]$ind, INT(NA,NA,1,2,2,2,2,NA,NA,NA)) # now ok test(1098, dt1[dt2,roll=-Inf,rollends=TRUE]$ind, INT(1,1,1,2,2,2,2,2,2,2)) # ok before test(1099, dt1[dt2,roll=-Inf,rollends=c(TRUE,FALSE)]$ind, INT(1,1,1,2,2,2,2,NA,NA,NA)) # ok before test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2,2,2)) # now ok # dcast.data.table if ("package:reshape2" %in% search()) { try(detach(package:reshape),silent=TRUE) # otherwise when reshape is loaded, can't access melt.data.table from reshape2. # try() in case reshape isn't loaded (detach gives not found error in that case) names(ChickWeight) <- tolower(names(ChickWeight)) DT <- melt(as.data.table(ChickWeight), id=2:4) # calls melt.data.table # no S3 method yet, have to use "dcast.data.table" test(1101, as.data.frame(dcast.data.table(DT, time ~ variable, fun=mean)), dcast(DT, time~variable, fun=mean)) test(1102, as.data.frame(dcast.data.table(DT, diet ~ variable, fun=sum)), dcast(DT, diet~variable, fun=sum)) x1 <- as.data.frame(dcast.data.table(DT, diet+chick ~ time, drop=FALSE)) x1$chick <- factor(x1$chick, levels=levels(x1$chick), ordered=FALSE) x2 <- dcast(DT, diet+chick~time, drop=FALSE) test(1103, x1,x2) x1 <- as.data.frame(dcast.data.table(DT, diet+chick ~ time, drop=FALSE, fill=0)) x1$chick <- factor(x1$chick, levels=levels(x1$chick), ordered=FALSE) x2 <- dcast(DT, diet+chick~time, drop=FALSE, fill=0) test(1104, x1,x2) } # test for freading commands x1 <- data.table(a = c(1:5), b = c(1:5)) f <- tempfile() write.csv(x1, f, row.names = FALSE) if (.Platform$OS.type == "unix") { test(1105, x1[a != 3], fread(paste('grep -v 3 ', f, sep=""))) } else { # x2 <- fread(paste('more ', f, sep="")) # Doesn't work on winbuilder. Relies on 'more' available in DOS via Cygwin? # Error: # Syntax error: end of file unexpected (expecting ")") # Error: (converted from warning) running command 'sh.exe -c (more D:\temp\RtmpgB8D2P\file1ed828a511cd) > D:\temp\RtmpgB8D2P\file1ed84f9f44f8' had status 2 # test(1105, x1, x2) } # test for "key" argument of [.data.table #x1 <- data.table(a = c(1:5), b = c(5:1)) #x1[J(2), key = 'a'] #test(1106, key(x1) == 'a') #x1[, a, key = NULL] #test(1107, is.null(key(x1))) # test that eval works inside expressions DT <- data.table(a = c(1:5)) s <- quote(a) test(1108, DT[, sum(eval(s))], DT[, sum(a)]) # test that boolean expression does not trigger a not-join DT <- data.table(a = 1:3, b = c(TRUE,FALSE,NA)) test(1109, DT[b != TRUE], DT[!(b == TRUE)]) # commented for now (by Arun) # # test the speed of simple comparison # DT <- data.table(a = 1:1e7) # t1 = system.time(DT[a == 100])[3] # t2 = system.time(DT[which(a == 100)])[3] # # make sure we're at most 30% slower than "which" (should pass most of the time) # test(1110, (t1 - t2)/t2 < 0.3) # test that a column named list is ok (this also affects other functions in by, might be worth adding a test for that) DT <- data.table(list = 1:6, a = 1:2) test(1111, DT[, lapply(.SD, sum), by = a], DT[, list(list = sum(list)), by = a]) # fix for #4995. "rbind" retains key when the first argument isn't a data.table (.rbind.data.table is never run is the issue) DT <- data.table(name=c('Guff','Aw'),id=101:102,id2=1:2,key='id') y <- rbind(list('No','NON',0L),DT,list('Extra','XTR',3L)) test(1112, key(y), NULL) # fix for http://stackoverflow.com/questions/14753411/why-does-data-table-lose-class-definition-in-sd-after-group-by # where, .SD loses class information. format.myclass <- function(x, ...){ paste("!!", x, "!!", sep = "") } DT <- data.table(L = rep(letters[1:3],3), N = 1:9) setattr(DT$N, "class", "myclass") test(1113, class(DT[, .SD, by = L]$N), class(DT$N)) setkey(DT, L) test(1114, class(DT[, .SD, by = L]$N), class(DT$N)) test(1115, class(DT[J(unique(L)), .SD]$N), class(DT$N)) # Fix for #4994 - not-join quoted expression dint work... dt = data.table(a = 1:2, key = 'a') dt1 = data.table(a = 1) expr = quote(!dt1) test(1116, dt[eval(expr)], dt[2]) expr = quote(!1) test(1117, dt[eval(expr)], dt[2]) # Fix for #2381 - optimisation of `DT[, lapply(.SD, function(x) FUN(x, bla)), by=key(DT)]` where "bla" is a column in DT dint work. set.seed(45) dt <- data.table(x=rep(1:4, each=4), b1=sample(16), b2=runif(16)) setkey(dt, x) test(1118, dt[, lapply(.SD, function(y) weighted.mean(y, b2, na.rm=TRUE)), by=x], dt[, lapply(.SD, weighted.mean, b2, na.rm=TRUE), by=x]) # a(nother) test of #295 DT <- data.table(x=5:1, y=1:5, key="y") test(1119, is.null(key(DT[, list(z = y, y = 1/y)]))) ## various ordered factor rbind tests DT = data.table(ordered('a', levels = c('a','b','c'))) DT1 = data.table(factor('a', levels = c('b','a','f'))) DT2 = data.table(ordered('b', levels = c('b','d','c'))) DT3 = data.table(c('foo', 'bar')) DT4 = data.table(ordered('a', levels = c('b', 'a'))) test(1120, rbind(DT, DT1, DT2, DT3), data.table(ordered(c('a','a','b', 'foo', 'bar'), levels = c('a','b','d','c','f', 'foo', 'bar')))) test(1121, rbindlist(list(DT, DT1, DT2, DT3)), data.table(ordered(c('a','a','b', 'foo', 'bar'), levels = c('a','b','d','c','f', 'foo', 'bar')))) test(1122, rbind(DT, DT4), data.table(factor(c('a','a'), levels = c('a','b','c'))), warning="ordered factor levels cannot be combined, going to convert to simple factor instead") test(1123, rbindlist(list(DT, DT4)), data.table(factor(c('a','a'), levels = c('a','b','c'))), warning="ordered factor levels cannot be combined, going to convert to simple factor instead") test(1124, rbind(DT1, DT1), data.table(factor(c('a','a'), levels = c('b','a','f')))) test(1125, rbindlist(list(DT1, DT1)), data.table(factor(c('a','a'), levels = c('b','a','f')))) ## test rbind(..., fill = TRUE) DT = data.table(a = 1:2, b = 1:2) DT1 = data.table(a = 3:4, c = 1:2) test(1126, rbind(DT, DT1, fill = TRUE), data.table(a = 1:4, b = c(1, 2, NA, NA), c = c(NA, NA, 1, 2))) ## check for #4959 - rbind'ing empty data.table's DT = data.table(a=character()) #test(1127, rbind(DT, DT), DT) ## check for #5005 DT = data.table(a=0:2,b=3:5,key="a") test(1128, DT[, (function(){b})()], DT[, b]) ## Fix for FR #4867 DT <- data.table(x=1:5, y=6:10) test(1129.1, DT[, as.factor(c("x", "y")), with=FALSE], DT) test(1129.2, DT[, as.factor(c("x", "x")), with=FALSE], DT[, list(x, x)]) # Fix for a specific case that results in error in `construct` function in data.table.R (found and fixed during #5007 bug fix) MyValueIsTen <- 10 set.seed(1) DT <- data.table(ID=sample(LETTERS[1:3], 6, TRUE), Value1=rnorm(6), Value2=runif(6)) cols <- c("Value1", "Value2") DT2 <- copy(DT) test(1130, DT[, (cols) := lapply(.SD, function(x) MyValueIsTen), by=ID], DT2[, (cols) := 10]) # Fix for #5007 - The value MyValueIsTen = 10 was never recognised (value within the function environment) MyValueIsTen <- 5 set.seed(1) DT <- data.table(ID=sample(LETTERS[1:3], 6, TRUE), Value1=rnorm(6), Value2=runif(6)) My_Fun <- function(x=copy(DT)) { MyValueIsTen <- 10 cols <- c("Value1", "Value2") x[, (cols) := lapply(.SD, function(x) MyValueIsTen), by=ID] } DT[, (cols) := 10] test(1131, My_Fun(), DT) # Test for #4957 - where `j` doesn't know `.N` when used with `lapply(.SD, function(x) ...)` test(1132, DT[, lapply(.SD, function(x) .N), by=ID], data.table(ID=c("A", "B", "C"), Value1=2L, Value2=2L)) # Test for #4990 - `:=` does not generate recycling warning during 'by': DT <- data.table(x=c(1,1,1,1,1,2,2)) # on a new column test(1133.1, DT[, new := c(1,2), by=x], data.table(x=c(1,1,1,1,1,2,2), new=c(1,2,1,2,1,1,2)), warning="Supplied 2 items to be assigned to group 1 of size 5 in column 'new'") # on an already existing column test(1133.2, DT[, new := c(1,2), by=x], data.table(x=c(1,1,1,1,1,2,2), new=c(1,2,1,2,1,1,2)), warning="Supplied 2 items to be assigned to group 1 of size 5 in column 'new'") # Fix for FR #2496 - catch `{` in `:=` expression in `j`: DT <- data.table(x=c("A", "A", "B", "B"), val =1:4) DT2 <- copy(DT)[, a := 1L] test(1134.1, DT[, {a := 1L}], DT2, warning="Caught and removed") test(1134.2, DT[, {b := 2L}, by=x], DT2[, b:=2L, by=x], warning="Caught and removed") # fix for bug #5069 if ("package:gdata" %in% search()) { DT <- data.table(a = c('asdfasdf','asdf','asdgasdgasdgasdg','sdg'), b = runif(4,0,1)) test(1135, write.fwf(DT, "df.txt"), NULL) } # FR #2693 and Gabor's suggestions from here: http://r.789695.n4.nabble.com/Problem-with-FAQ-2-8-tt4668878.html (correcting software according to FAQ 2.8) d1 <- data.table(id1 = c(1L, 2L, 2L, 3L), val = 1:4, key = "id1") d2 <- data.table(id2 = c(1L, 2L, 4L), val2 = c(11, 12, 14),key = "id2") d3 <- copy(d2) setnames(d3, names(d1)) test(1136.1, d1[d2, id1], data.table(id1=d2$id2, id1=c(1L,2L,NA), key="id1")) test(1136.2, d1[d2, id2], data.table(id1=d2$id2, id2=d2$id2, key="id1")) test(1136.3, d1[d3, i.id1], data.table(id1=d3$id1, i.id1=d3$id1, key="id1")) test(1136.4, d1[d2, val], data.table(id1=c(1L, 2L, 2L, 4L), val=c(1:3, NA), key="id1")) test(1136.5, d1[d2, val2], data.table(id1=c(1L, 2L,4L), val2=c(11,12,14), key="id1")) test(1136.6, d1[d3, list(val, i.val)], data.table(id1=c(1L, 2L,2L,4L), val=c(1:3, NA), i.val=c(11,12,12,14), key="id1")) test(1136.7, d1[d3, list(id1, i.id1, val, i.val)], data.table(id1=c(1L, 2L,2L,4L), id1=c(1L, 2L, 2L, NA), i.id1=c(1L, 2L, 2L, 4L), val=c(1:3, NA), i.val=c(11,12,12,14), key="id1")) test(1136.8, d1[d2], data.table(id1=c(1L,2L,2L,4L), val=c(1:3, NA), val2=c(11,12,12,14), key="id1")) test(1136.9, d1[J(2), id1], data.table(id1=c(2L), id1=c(2L), key="id1")) # id1 refers to x's id1 test(1136.10, d1[J(2), i.id1], data.table(id1=c(2L), i.id1=c(2L), key="id1")) # i.id1 refers to i's id1 DT <- data.table(x=c("A", "A", "C", "C"), y=1:4, key="x") test(1136.11, DT["C", i.x], data.table(x="C", i.x="C", key="x")) # test for FR #4979 DT <- data.table(x=1:5, y=6:10, z=11:15) test(1137.1, DT[, .SD, .SDcols=-1L], DT[, 2:3, with=FALSE]) test(1137.2, DT[, .SD, .SDcols=-(1:2)], DT[, 3, with=FALSE]) test(1137.3, DT[, .SD, .SDcols=-"y"], DT[, c(1,3), with=FALSE]) test(1137.4, DT[, .SD, .SDcols=-c("y", "x")], DT[, 3, with=FALSE]) test(1137.5, DT[, .SD, .SDcols=-which(names(DT) %in% c("x", "y", "z"))], null.data.table()) test(1137.6, DT[, .SD, .SDcols=c(1, -2)], error=".SDcols is numeric but has both") test(1137.7, DT[, .SD, .SDcols=c("x", -"y")], error="invalid argument to unary") test(1137.8, DT[, .SD, .SDcols=c(-1, "x")], error="Some items of .SDcols are") DT <- data.table(x=1:5, y=6:10, z=11:15, zz=letters[1:5]) test(1137.9, DT[, .SD, .SDcols=-grep("^z", names(DT))], DT[, 1:2, with=FALSE]) test(1137.10, DT[, .SD, .SDcols=-grep("^z", names(DT), value=TRUE)], DT[, 1:2, with=FALSE]) test(1137.11, DT[, .SD, .SDcols=-grep("^z", names(DT), value=TRUE, invert=TRUE)], DT[, 3:4, with=FALSE]) set.seed(45) DT = data.table(x=c("A", "A", "C", "C"), y=1:4, z=runif(4)) test(1137.12, DT[, lapply(.SD, sum), by=x, .SDcols=-"y"], DT[, lapply(.SD, sum), by=x, .SDcols="z"]) # test for FR #5020 - print.data.table gets new argument "row.names", default=TRUE. if FALSE, the row-names don't get printed # Thanks to Eddi for `capture.output` function! DT <- data.table(x=1:5, y=6:10) test(1138, capture.output(print(DT, row.names=FALSE)), c(" x y", " 1 6", " 2 7", " 3 8", " 4 9", " 5 10")) # test for FR #2591 (format.data.table issue with column of class "formula") DT <- data.table(x=c(a~b, c~d+e), y=1:2) test(1139, capture.output(print(DT)), c(" x y", "1: a ~ b 1", "2: c ~ d + e 2")) # FR #4813 - provide warnings if there are remainders for both as.data.table.list(.) and data.table(.) X = list(a = 1:2, b = 1:3) test(1140, as.data.table(X), data.table(a=c(1,2,1), b=c(1,2,3)), warning="Item 1 is of size 2 but maximum") test(1141.1, data.table(a=1:2, b=1:3), data.table(a=c(1L,2L,1L), b=1:3), warning="Item 1 is of size 2 but maximum") test(1141.2, data.table(a=1:2, data.table(x=1:5, y=6:10)), data.table(a=c(1L,2L,1L,2L,1L), x=1:5, y=6:10), warning="Item 1 is of size 2 but maximum") test(1141.3, data.table(a=1:5, data.table(x=c(1,2), y=c(3,4))), data.table(a=c(1:5), x=c(1,2,1,2,1), y=c(3,4,3,4,3)), warning="Item 2 is of size 2 but maximum") # Fix for bug #5098 - DT[, foo()] returns function definition. DT <- data.table(a=1:2) foo <- function() sum(1:5) test(1142, DT[, foo()], 15L) # Fix for bug #5106 - DT[, .N, by=y] was slow when "y" is not a column in DT DT <- data.table(x=sample.int(10, 1e6, replace=TRUE)) y <- DT$x te1 <- system.time(ans1 <- DT[, .N, by=x])[["elapsed"]] te2 <- system.time(ans2 <- DT[, .N, by=y])[["elapsed"]] test(1143.1, ans1, setnames(ans2, "y", "x")) test(1143.2, abs(te1-te2) < 1, TRUE) # Fix for bug #5104 - side-effect of fixing #2531 - `:=` with grouping (by) and assigning factor columns DT <- data.table(x=c(1,1,1,2,2), y=factor(letters[1:5])) test(1144, DT[, z := y, by=x], data.table(x=c(1,1,1,2,2), y=factor(letters[1:5]), z=1:5)) # FR #2356 - retain names of named vector as column with keep.rownames=TRUE x <- 1:5 setattr(x, 'names', letters[1:5]) test(1144.1, as.data.table(x, keep=TRUE), data.table(rn=names(x), x=unname(x))) x <- as.numeric(x) setattr(x, 'names', letters[1:5]) test(1144.2, as.data.table(x, keep=TRUE), data.table(rn=names(x), x=unname(x))) x <- as.character(x) setattr(x, 'names', letters[1:5]) test(1144.3, as.data.table(x, keep=TRUE), data.table(rn=names(x), x=unname(x))) x <- as.factor(x) setattr(x, 'names', letters[1:5]) test(1144.4, as.data.table(x, keep=TRUE), data.table(rn=names(x), x=unname(x))) x <- as.Date(1:5, origin="2013-01-01") setattr(x, 'names', letters[1:5]) test(1144.5, as.data.table(x, keep=TRUE), data.table(rn=names(x), x=unname(x))) # Fix for bug #5114 - .data.table.locked ISSUE DT <- data.table(x=1:5, y=6:10) xx <- DT[, .SD, .SDcols="y"] test(1145, xx[, y := as.numeric(y)], data.table(y = as.numeric(6:10))) # Fix for bug #5115 - set not adding columns on class that builds on data.table DT <- as.data.table(BOD) ans = copy(DT)[, Time := as.numeric(Time)] setattr(DT, "class", c("myclass", class(DT))) setattr(ans, 'class', class(DT)) test(1146.1, DT[, Time:= as.numeric(Time)], ans) DF <- as.data.frame(DT) test(1146.2, {set(DF, i=NULL, j=1L, value=seq_len(nrow(DF)));setattr(DF,"reference",NULL);DF}, data.frame(Time=1:nrow(BOD), demand=BOD$demand)) test(1146.3, set(DF, i=NULL, j="bla", value=seq_len(nrow(DF))), error="set() on a data.frame is for changing existing columns, not adding new ones. Please use a data.table for that.") # Feature - implemented fast radix order for numeric types (both +ve and -ve numerics). # note that if "x" is already a list, then the values will be modified by reference! # Note: 'ordernumtol' doesn't distinguish between NA and NaN whereas this one does! # R-wrapper is dradixorder set.seed(45) x <- rnorm(1e6)*1e4 test(1147.1, base::order(x), dradixorder(x, tol=numeric(0))) # base::order doesn't test with tolerance test(1147.2, ordernumtol(x), dradixorder(x)) tol = .Machine$double.eps^0.5 x <- c(8, NaN, Inf, -7.18918, 5.18909+0.07*tol, NA, -7.18918111, -Inf, NA, 5.18909, NaN, 5.18909-1.2*tol, 5.18909-0.04*tol) test(1147.3, dradixorder(x), c(6L, 9L, 2L, 11L, 8L, 7L, 4L, 12L, 5L, 10L, 13L, 1L, 3L)) if ("package:reshape2" %in% search()) { try(detach(package:reshape),silent=TRUE) # otherwise when reshape is loaded, can't access melt.data.table from reshape2. # try() in case reshape isn't loaded (detach gives not found error in that case) # Fix for case 2 in bug report #5149 - dcast.data.table dint aggregate properly when formula RHS has "." set.seed(45) DT = data.table(x=rep(1:5, each=3), y=runif(15, 0, 1)) ans = data.table(dcast(DT, x ~ ., mean, value.var="y")) setnames(ans, c("x", "V1")) setkey(ans, x) test(1148.1, dcast.data.table(DT, x ~ ., mean, value.var="y"), ans) # also quashed another bug with `.` in formula (when there's no aggregate function): DT <- data.table(a=sample(5), b=runif(5), c=5:1) ans1 = data.table(dcast(DT, a ~ ., value.var="c")) ans2 = data.table(dcast(DT, b+a ~ ., value.var="c")) setnames(ans1, c("a", "c")) setnames(ans2, c("b", "a", "c")) setkey(ans1, "a") setkey(ans2, "b", "a") test(1148.2, dcast.data.table(DT, a ~ ., value.var="c"), ans1) test(1148.3, dcast.data.table(DT, b+a~., value.var="c"), ans2) } # test for `iradixorder` when input is integer(0) and numeric(0) test(1149.1, iradixorder(integer(0)), integer(0)) test(1149.2, iradixorder(numeric(0)), error="iradixorder is only for integer") # more tests for `dcast.data.table` with formula being character and errors when formula is a hybrid set.seed(1) x <- data.table(a=rep(1:5, each=5), b=runif(25)) ### adding all extra arguments for no verbose during "test.data.table()" to all dcast.data.table tests test(1150.1, dcast.data.table(x, " a~ . ", value.var="b", fun=length), data.table(a=1:5, V1=5L, key="a")) test(1150.2, dcast.data.table(x, "a ~ c ", value.var="b"), error="Column 'c' not found") test(1150.3, dcast.data.table(x, a ~ a, value.var="c"), error="'value.var' column 'c' not found") # test uniqlengths set.seed(45) x <- sample(c(NA_integer_, 1:1e5), 1e7, TRUE) ox <- forder(x) o1 <- uniqlist(list(x), ox) test(1151.1, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) o1 <- uniqlist(list(x)) test(1151.2, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x))) # #5190 fix - grouping with .SDcols gave "symbol not subsettable error" - consequence of FR #4979 implementation dt = data.table(grp = sample(letters[1:3],20, replace = TRUE), v1 = rnorm(20), v2 = rnorm(20)) sd.cols <- "v1" test(1152, dt[, lapply(.SD, mean), by=grp, .SDcols=sd.cols], dt[, list(v1=mean(v1)), by=grp]) # #5171 fix - setattr attribute non-character led to segfault x <- FALSE if ("package:bit" %in% search()) { try(detach(package:bit64),silent=TRUE) try(detach(package:bit),silent=TRUE) if (.devtesting) { test(1153, setattr(x, FALSE, FALSE), error="Attribute name must be") } else { test(1153, data.table:::setattr(x, FALSE, FALSE), error="Attribute name must be") } } # Fixed binary search capabilities for NA (for int and double) and NaN (for double): set.seed(1) DT <- data.table(x=sample(c(NA, NaN, Inf, 1:10), 100, TRUE), y=sample(c(NA, 1:10), 100, TRUE), z=sample(c(NA_character_, letters[1:10]), 100, TRUE)) setkey(DT, x) test(1154.1, DT[J(NaN)], DT[is.nan(x)]) test(1154.2, DT[J(NA_real_)], DT[is.na(x) & !is.nan(x)]) setkey(DT, y) test(1154.3, setcolorder(DT[J(NA_integer_)], c("x", "y", "z")), DT[is.na(y)]) setkey(DT, z) test(1154.4, setcolorder(DT[J(NA_character_)], c("x", "y", "z")), DT[is.na(z)]) # Fixing the binary search above for NA/NaN also fixes BUG #4918 dt1 <- data.table(x = c('red','orange','green'), y=c(1,2,NA), key='y') dt2 <- data.table(y = c(1,2,3,NA), z = c('a','b','c','missing data'), key='y') test(1155.1, merge(dt1, dt2, by=c('y')), data.table(y=dt1$y, x=dt1$x, z=dt2$z[1:3], key="y")) test(1155.2, dt2[dt1], data.table(y=dt1$y, z=dt2$z[1:3], x=dt1$x, key="y")) test(1155.3, dt1[dt2, nomatch=0L], data.table(y=dt1$y, x=dt1$x, z=dt2$z[1:3], key="y")) # NaN wasn't properly searched for in some cases. Fixed that. Here's the fix! dt <- structure(list(x = c(NaN, NaN, NaN, NaN, NaN, NA, NA, -3, -3, -3, -2, -2, -1, 0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 3, 3), y = c(16L, 25L, 23L, 17L, 21L, 11L, 13L, 15L, 1L, 6L, 4L, 18L, 7L, 3L, 12L, 24L, 2L, 10L, 20L, 14L, 9L, 19L, 8L, 22L, 5L)), .Names = c("x", "y"), row.names = c(NA, -25L), class = c("data.table", "data.frame" )) setkey(dt, x) test(1155.4, dt[J(NaN)], dt[is.nan(x)]) test(1155.5, dt[J(NA_real_)], dt[is.na(x) & !is.nan(x)]) # Fix for (usually small) memory leak when grouping, #2648. # Deliberate worst case: largest group (100000 rows) followed last by a small group (1 row). DT = data.table(A=rep(1:2,c(100000,1)), B=runif(100001)) before = gc()["Vcells",2] for (i in 1:50) DT[, sum(B), by=A] after = gc()["Vcells",2] test(1157, after < before+1) # +1 = 1MB # Before the patch, Vcells grew dramatically from 6MB to 60MB. Now stable at 6MB. Increase 50 to 1000 and it grew to over 1GB for this case. # Similar for when dogroups writes less rows than allocated, #2648. DT = data.table(k = 1:50, g = 1:20, val = rnorm(1e4)) before = gc()["Vcells",2] for (i in 1:50) DT[ , unlist(.SD), by = 'k'] after = gc()["Vcells",2] test(1158, after < before+1) # tests for 'setDT' - convert list, DF to DT without copy x <- data.frame(a=1:4, b=5:8) test(1159.1, setDT(x), data.table(a=1:4, b=5:8)) x <- list(1:4, 5:8) test(1159.2, setDT(x), data.table(1:4, 5:8)) x <- list(1:4, 5:8) test(1159.3, setDT(x, FALSE), setnames(data.table(1:4, 5:8), c("", ""))) x <- list(a=1:4, b=5:8) test(1159.4, setDT(x), data.table(a=1:4, b=5:8)) x <- list(a=1:4, 5:8) test(1159.5, setDT(x), setnames(data.table(1:4, 5:8), c("a", "V1"))) x <- list(a=1:4, 5:8) test(1159.6, setDT(x, FALSE), setnames(data.table(1:4, 5:8), c("a", ""))) x <- data.table(a=1:4, b=5:8) test(1159.7, setDT(x), data.table(a=1:4, b=5:8)) x <- 1:5 test(1159.8, setDT(x), error="Argument 'x' to 'setDT' should be a") x <- list(1, 2:3) test(1159.9, setDT(x), error="All elements in argument 'x' to 'setDT'") # tests for setrev x <- sample(10) y <- rev(x) setrev(x) test(1160.1, y, x) x <- sample(c(1:10, NA), 21, TRUE) y <- rev(x) setrev(x) test(1160.2, y, x) x <- sample(runif(10)) y <- rev(x) setrev(x) test(1160.3, y, x) x <- sample(c(runif(10), NA, NaN), 21, TRUE) y <- rev(x) setrev(x) test(1160.4, y, x) x <- sample(letters) y <- rev(x) setrev(x) test(1160.5, y, x) x <- as.logical(sample(0:1, 20, TRUE)) y <- rev(x) setrev(x) test(1160.6, y, x) x <- list(1:10) test(1160.7, setrev(x), error="Input 'x' must be a vector") # tests for setreordervec # integer x <- sample(c(-10:10, NA), 100, TRUE) o <- base::order(x, na.last=FALSE) y <- copy(x) setreordervec(y, o) test(1161.1, x[o], y) # numeric x <- sample(c(NA, rnorm(10)), 100, TRUE) o <- base::order(x, na.last=FALSE) y <- copy(x) setreordervec(y, o) test(1161.2, x[o], y) # character x <- sample(c(NA, letters), 100, TRUE) o <- base::order(x, na.last=FALSE) y <- copy(x) setreordervec(y, o) test(1161.3, x[o], y) # tests for setreordervec DT <- data.table(x=sample(c(NA, -10:10), 2e2, TRUE), y=sample(c(NA, NaN, -Inf, Inf, -10:10), 2e2, TRUE), z=sample(c(NA, letters), 2e2, TRUE)) # when not sorted, should return FALSE test(1162.1, is.sorted(DT[[1L]]), FALSE) setkey(DT, x) test(1162.2, is.sorted(DT[[1L]]), TRUE) test(1162.3, is.sorted(DT[[2L]]), FALSE) setkey(DT, y) test(1162.4, is.sorted(DT[[2L]]), TRUE) test(1162.5, is.sorted(DT[[3L]]), FALSE) setkey(DT, z) test(1162.6, is.sorted(DT[[3L]]), TRUE) setkey(DT, x, y) test(1162.7, length(forder(DT, by=1:2)), 0) setkey(DT, x, z) test(1162.8, length(forder(DT, by=c(1L, 3L))), 0) setkey(DT, y, z) test(1162.9, length(forder(DT, by=2:3)), 0) setkey(DT) # test number 1162.10 skipped because if it fails it confusingly prints out as 1662.1 not 1662.10 test(1162.11, length(forder(DT, by=1:3)), 0) test(1162.12, is.sorted(DT, by=1:3), TRUE, warning="Use.*forder.*for efficiency in one step, so you have o as well if not sorted") test(1162.13, is.sorted(DT, by=2:1), FALSE, warning="Use.*forder.*for efficiency in one step, so you have o as well if not sorted") # FR #5152 - last on length=0 arguments x <- character(0) test(1163, last(x), character(0)) # Bug fix for #5159 - chmatch and character encoding (for some reason this seems to pass the test on a mac as well) a<-c("a","ä","ß","z") au<-iconv(a,"UTF8","latin1") test(1164.1, chmatch(a, au), match(a, au)) # Bug fix for #5117 - segfault when rbindlist on empty data.tables x <- as.data.table(BOD) y <- copy(x) test(1165, x[Time>100], rbindlist(list(x[Time > 100], y[Time > 200]))) # Bug fix for the #5300 - rbind(DT, NULL) should not result in error x <- as.data.table(BOD) test(1166, x, rbind(x, NULL)) # fix for bug #5307 - ordering with multiple columns in which at least one of them is a logical column foo = data.table(a=rep(c(0L,1L,0L,1L),2), b=rep(c(TRUE,TRUE,FALSE,FALSE),2), c=1L) test(1167, foo[, .N, by=list(b,a)], data.table(b=c(TRUE, TRUE, FALSE, FALSE), a=c(0L,1L,0L,1L), N=2L)) # fix for bug #5355 - rbindlist with factor columns and empty data.tables resulted in error. A <- data.table(x=factor(1), key='x') B <- data.table(x=factor(), key='x') test(1168.1, rbindlist(list(B,A)), data.table(x=factor(1))) # fix for bug #5120, it's related to rbind and factors as well - more or less similar to 1168.1 (#5355). Seems to have been fixed with that commit. Just adding test here. tmp1 <- as.data.table(structure(list(Year = 2013L, Maturity = structure(1L, .Label = c("<1", "1.0 - 1.5", "1.5 - 2.0", "2.0 - 2.5", "2.5 - 3.0", "3.0 - 4.0", "4.0 - 5.0", ">5.0"), class = "factor"), Quality = structure(2L, .Label = c(">BBB", "BBB", "BB", "B", "CCC", "