A mlr3::DataBackend using dplyr::tbl() from packages dplyr/dbplyr. This includes tibbles. It allows a mlr3::Task to interface with an out-of-memory data base.

Format

R6::R6Class object inheriting from mlr3::DataBackend.

Construction

DataBackendDplyr$new(data, primary_key = NULL, strings_as_factors = TRUE)
  • data :: dplyr::tbl()
    The data object.

  • primary_key :: character(1)
    Name of the primary key column.

  • strings_as_factors :: logical(1) || character()
    Either a character vector of column names to convert to factors, or a single logical flag: if FALSE, no column will be converted; if TRUE, all string columns (except the primary key) will be converted. For each converted column, the backend is queried for its distinct values, and the resulting levels are stored in $levels.

Alternatively, use mlr3::as_data_backend() on a dplyr::tbl() which will construct a DataBackend for you.

Fields

All fields from mlr3::DataBackend, and additionally:

  • levels :: named list()
    List of factor levels, named with column names. These columns are automatically converted to factors in $data() and $head().

Methods

All methods from mlr3::DataBackend.

Examples

# Backend using an in-memory tibble
data = tibble::as_tibble(iris)
data$Sepal.Length[1:30] = NA
data$row_id = 1:150
b = DataBackendDplyr$new(data, primary_key = "row_id")

# Object supports all accessors of DataBackend
print(b)
#> <DataBackendDplyr> (150x6) #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species row_id #> NA 3.5 1.4 0.2 setosa 1 #> NA 3.0 1.4 0.2 setosa 2 #> NA 3.2 1.3 0.2 setosa 3 #> NA 3.1 1.5 0.2 setosa 4 #> NA 3.6 1.4 0.2 setosa 5 #> NA 3.9 1.7 0.4 setosa 6 #> [...] (144 rows omitted)
b$nrow
#> [1] 150
b$ncol
#> [1] 6
b$colnames
#> [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species" #> [6] "row_id"
b$data(rows = 100:101, cols = "Species")
#> Species #> 1: versicolor #> 2: virginica
b$distinct(b$rownames, "Species")
#> $Species #> [1] "setosa" "versicolor" "virginica" #>
# Classification task using this backend
task = mlr3::TaskClassif$new(id = "iris_tibble", backend = b, target = "Species")
print(task)
#> <TaskClassif:iris_tibble> (150 x 5) #> * Target: Species #> * Properties: multiclass #> * Features (4): #> - dbl (4): Petal.Length, Petal.Width, Sepal.Length, Sepal.Width
task$head()
#> Species Petal.Length Petal.Width Sepal.Length Sepal.Width #> 1: setosa 1.4 0.2 NA 3.5 #> 2: setosa 1.4 0.2 NA 3.0 #> 3: setosa 1.3 0.2 NA 3.2 #> 4: setosa 1.5 0.2 NA 3.1 #> 5: setosa 1.4 0.2 NA 3.6 #> 6: setosa 1.7 0.4 NA 3.9
# Create a temporary SQLite data base
con = DBI::dbConnect(RSQLite::SQLite(), ":memory:")
dplyr::copy_to(con, data)
tbl = dplyr::tbl(con, "data")

# Define a backend on a subset of the data base
tbl = dplyr::select_at(tbl, setdiff(colnames(tbl), "Sepal.Width")) # do not use column "Sepal.Width"
tbl = dplyr::filter(tbl, row_id %in% 1:120) # Use only first 120 rows
b = DataBackendDplyr$new(tbl, primary_key = "row_id")
print(b)
#> <DataBackendDplyr> (120x5) #> Sepal.Length Petal.Length Petal.Width Species row_id #> NA 1.4 0.2 setosa 1 #> NA 1.4 0.2 setosa 2 #> NA 1.3 0.2 setosa 3 #> NA 1.5 0.2 setosa 4 #> NA 1.4 0.2 setosa 5 #> NA 1.7 0.4 setosa 6 #> [...] (114 rows omitted)
# Query distinct values
b$distinct(b$rownames, "Species")
#> $Species #> [1] "setosa" "versicolor" "virginica" #>
# Query number of missing values
b$missings(b$rownames, b$colnames)
#> Sepal.Length Petal.Length Petal.Width Species row_id #> 30 0 0 0 0
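# Note that SQLite does not support factors, so column Species is stored as character in the data base.
# The backend converts it back to a factor on retrieval (strings_as_factors = TRUE by default).
lapply(b$head(), class)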
#> $Sepal.Length #> [1] "numeric" #> #> $Petal.Length #> [1] "numeric" #> #> $Petal.Width #> [1] "numeric" #> #> $Species #> [1] "factor" #> #> $row_id #> [1] "integer" #>
# Cleanup
rm(tbl)
DBI::dbDisconnect(con)