A mlr3::DataBackend using dplyr::tbl() from packages dplyr/dbplyr. This includes tibbles. Allows connecting a mlr3::Task to an out-of-memory database.

Format

R6::R6Class object inheriting from mlr3::DataBackend.

Construction

DataBackendDplyr$new(data, primary_key = NULL)

Alternatively, use as_data_backend() on a dplyr::tbl(), which will construct a DataBackend with a copy of the data.

Fields

  • nrow :: integer(1)
    Number of rows (observations).

  • ncol :: integer(1)
    Number of columns (variables), including the primary key column.

  • colnames :: character()
    Returns vector of all column names, including the primary key column.

  • rownames :: integer() | character()
    Returns vector of all distinct row identifiers, i.e. the primary key column.

  • hash :: character(1)
    Returns a unique hash for this backend. This hash is cached.

  • data_formats :: character()
    Vector of supported data formats. One of these supported formats can be selected in the $data() method.

Methods

Examples

# Backend using an in-memory tibble data = tibble::as.tibble(iris)
#> Warning: `as.tibble()` is deprecated, use `as_tibble()` (but mind the new semantics). #> This warning is displayed once per session.
data$Sepal.Length[1:30] = NA data$row_id = 1:150 b = DataBackendDplyr$new(data, primary_key = "row_id") # Object supports all accessors of DataBackend print(b)
#> <DataBackendDbplyr> (150x6) #> #> Public: colnames, data_formats, data(), distinct(), hash, head(), #> missings(), ncol, nrow, primary_key, rownames #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species row_id #> 1: NA 3.5 1.4 0.2 setosa 1 #> 2: NA 3.0 1.4 0.2 setosa 2 #> 3: NA 3.2 1.3 0.2 setosa 3 #> 4: NA 3.1 1.5 0.2 setosa 4 #> 5: NA 3.6 1.4 0.2 setosa 5 #> 6: NA 3.9 1.7 0.4 setosa 6
b$nrow
#> [1] 150
b$ncol
#> [1] 6
b$colnames
#> [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species" #> [6] "row_id"
b$data(rows = 100:101, cols = "Species")
#> Species #> 1: versicolor #> 2: virginica
b$distinct(b$rownames, "Species")
#> $Species #> [1] "setosa" "versicolor" "virginica" #>
# Classification task using this backend task = mlr3::TaskClassif$new(id = "iris_tibble", backend = b, target = "Species") print(task)
#> <TaskClassif:iris_tibble> (150 x 5) #> Target: Species #> Features (4): #> * dbl (4): Petal.Length, Petal.Width, Sepal.Length, Sepal.Width #> #> Public: backend, cbind(), class_n, class_names, clone(), col_info, #> col_roles, data_formats, data(), droplevels(), feature_names, #> feature_types, filter(), formula(), groups, hash, head(), id, #> levels(), measures, missings(), ncol, negative, nrow, positive, #> properties, rbind(), replace_features(), row_ids, row_roles, #> select(), set_col_role(), set_row_role(), target_names, task_type, #> truth(), weights
task$head()
#> Species Petal.Length Petal.Width Sepal.Length Sepal.Width #> 1: setosa 1.4 0.2 NA 3.5 #> 2: setosa 1.4 0.2 NA 3.0 #> 3: setosa 1.3 0.2 NA 3.2 #> 4: setosa 1.5 0.2 NA 3.1 #> 5: setosa 1.4 0.2 NA 3.6 #> 6: setosa 1.7 0.4 NA 3.9
# Create a temporary SQLite data base con = DBI::dbConnect(RSQLite::SQLite(), ":memory:") dplyr::copy_to(con, data) tbl = dplyr::tbl(con, "data") # Define a backend on a subset of the data base tbl = dplyr::select_at(tbl, setdiff(colnames(tbl), "Sepal.Width")) # do not use column "Sepal.Width" tbl = dplyr::filter(tbl, row_id %in% 1:120) # Use only first 120 rows b = DataBackendDplyr$new(tbl, primary_key = "row_id") print(b)
#> <DataBackendDbplyr> (120x5) #> #> Public: colnames, data_formats, data(), distinct(), hash, head(), #> missings(), ncol, nrow, primary_key, rownames #> Sepal.Length Petal.Length Petal.Width Species row_id #> 1: NA 1.4 0.2 setosa 1 #> 2: NA 1.4 0.2 setosa 2 #> 3: NA 1.3 0.2 setosa 3 #> 4: NA 1.5 0.2 setosa 4 #> 5: NA 1.4 0.2 setosa 5 #> 6: NA 1.7 0.4 setosa 6
# Query distinct values b$distinct(b$rownames, "Species")
#> $Species #> [1] "setosa" "versicolor" "virginica" #>
# Query number of missing values b$missings(b$rownames, b$colnames)
#> Sepal.Length Petal.Length Petal.Width Species row_id #> 30 0 0 0 0
# Note that SQLite does not support factors, column Species has been converted to character lapply(b$head(), class)
#> $Sepal.Length #> [1] "numeric" #> #> $Petal.Length #> [1] "numeric" #> #> $Petal.Width #> [1] "numeric" #> #> $Species #> [1] "character" #> #> $row_id #> [1] "integer" #>
# Cleanup rm(tbl) DBI::dbDisconnect(con)