--- title: "Storing & grouping by vessel trips" output: html_document: toc: true vignette: > %\VignetteIndexEntry{Trip metadata} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- ```{r, message=FALSE, echo=FALSE} logging::setLevel('WARN') library(unittest) # Redirect ok() output to stderr ok <- function(...) capture.output(unittest::ok(...), file = stderr()) library(mfdb) # Remove our attributes from a dataframe - only used for testing unattr <- function (obj) { attributes(obj) <- attributes(obj)[c('names', 'row.names', 'class')] obj } ``` Samples can be assigned to a *trip*, which provides a way of including socioeconomic data into MFDB. The following examples use the following table_string helper to succintly define tables: ```{r} # Convert a string into a data.frame table_string <- function (text, ...) read.table( text = text, blank.lines.skip = TRUE, header = TRUE, stringsAsFactors = FALSE, ...) ``` Then connect to a database and set up some areas/divisions: ```{r} mdb <- mfdb(tempfile(fileext = '.duckdb')) mfdb_import_area(mdb, table_string(' name division size 45G01 divA 10 45G02 divA 200 45G03 divB 400 ')) ``` ## Importing data Trips can have a *start_port* and *end_port* associated with them, before we can do this we need to define any ports we use. We do this with ``mfdb_import_port_taxonomy``: ```{r} mfdb_import_port_taxonomy(mdb, table_string(' name description institute latitude longitude ISHFF Hafnarfjordur ISL 64.04 21.57 NOHVJ Hvalfjordur ISL 64.21 21.45 ISKEF Keflavik ISL 64.00 22.33 ISREK Reykjavik ISL 64.09 21.55 ISSEJ Seydisfjordur ISL 65.15 13.55 ISKJF Skerjafjordur ISL 64.07 21.54 ')) ``` We can now define trips that use these ports: ```{r} mfdb_import_trip_taxonomy(mdb, table_string(' name start_date end_date crew oil_consumption start_port end_port T1 2019-01-21 2019-02-11 4 3000 ISKEF ISREK T2 2019-01-24 2019-02-14 5 9000 ISKEF ISSEJ T3 2019-04-21 2019-05-11 4 3000 ISREK ISKEF T4 2019-04-24 2019-05-14 5 9000 ISSEJ ISKEF ')) ``` There is no formal link between trips and tows, so you can link a sample to a tow without recording trips. But we can create tows and vessels to use: ```{r} mfdb_import_tow_taxonomy(mdb, table_string(' name latitude longitude depth length T1a 64.1 -23.15 98.82 10 T1b 64.1 -23.15 98.82 10 T2a 64.1 -23.15 98.82 10 T2b 64.1 -23.15 98.82 10 T3a 64.1 -23.15 98.82 10 T3b 64.1 -23.15 98.82 10 T4a 64.1 -23.15 98.82 10 T4b 64.1 -23.15 98.82 10 ')) ``` ```{r} mfdb_import_vessel_taxonomy(mdb, table_string(' name full_name V1 "Vessel 1" V2 "Vessel 2" ')) ``` And finally import data that is associated to a vessel/trip/tow: ```{r} mfdb_import_survey(mdb, data_source = "cod2000", table_string(" year month areacell species vessel trip tow length count 2019 1 45G01 COD V1 T1 T1a 10 358 2019 1 45G01 COD V1 T1 T1a 20 320 2019 1 45G01 COD V1 T1 T1a 30 162 2019 1 45G02 COD V1 T1 T1b 10 240 2019 1 45G02 COD V1 T1 T1b 20 278 2019 1 45G02 COD V1 T1 T1b 30 122 2019 2 45G01 COD V2 T2 T2a 10 255 2019 2 45G01 COD V2 T2 T2a 20 138 2019 2 45G01 COD V2 T2 T2a 30 168 2019 2 45G02 COD V2 T2 T2b 10 349 2019 2 45G02 COD V2 T2 T2b 20 106 2019 2 45G02 COD V2 T2 T2b 30 262 2019 4 45G01 COD V1 T3 T3a 10 395 2019 4 45G01 COD V1 T3 T3a 20 214 2019 4 45G01 COD V1 T3 T3a 30 195 2019 4 45G02 COD V1 T3 T3b 10 396 2019 4 45G02 COD V1 T3 T3b 20 325 2019 4 45G02 COD V1 T3 T3b 30 269 2019 5 45G01 COD V2 T4 T4a 10 239 2019 5 45G01 COD V2 T4 T4a 20 309 2019 5 45G01 COD V2 T4 T4a 30 162 2019 5 45G02 COD V2 T4 T4b 10 342 2019 5 45G02 COD V2 T4 T4b 20 175 2019 5 45G02 COD V2 T4 T4b 30 299 ")) ``` ## Querying data We can now use the ``mfdb_sample_*`` functions to select this data back out again. We can group and filter by any of the trip attributes. For instance, to group/filter by ``trip_start_date``: ```{r} agg_data <- mfdb_sample_count(mdb, c('trip_start_date', 'length'), list( trip_start_date = '2019-01-21', length = mfdb_unaggregated())) agg_data ``` ```{r, message=FALSE, echo=FALSE} # Convert date to char to avoid timezone complications agg_data[[1]]$trip_start_date <- as.character(agg_data[[1]]$trip_start_date) ok(ut_cmp_equal(unattr(agg_data[[1]]), table_string(' year step area trip_start_date length number all all all 2019-01-21 10 598 all all all 2019-01-21 20 598 all all all 2019-01-21 30 284 ')), "trip_start_date = '2019-01-21'") ``` Note that we've summed T1a and T1b, since their start date matches. We can use ``mfdb_group``() to aggregate several ports, for instance: ```{r} agg_data <- mfdb_sample_count(mdb, c('trip_start_port', 'length'), list( trip_start_port = mfdb_group(KEF = 'ISKEF', other = c('ISREK', 'ISSEJ')), length = mfdb_unaggregated())) agg_data ``` ```{r, message=FALSE, echo=FALSE} # Translated trip_start_port in and out of attributes ok(ut_cmp_equal(unattr(agg_data[[1]]), table_string(' year step area trip_start_port length number all all all KEF 10 1202 all all all KEF 20 842 all all all KEF 30 714 all all all other 10 1372 all all all other 20 1023 all all all other 30 925 ')), "trip_start_port = mfdb_group(...)") ``` We can also use ``mfdb_dplyr_sample`` to re-extract data, joining with port metadata: ```{r} # TODO: Use native pipe syntax once available x <- mfdb_dplyr_sample(mdb, c('trip_start_port', 'trip_end_port', 'trip_start_port_latitude')) x <- dplyr::select(x, 'year', 'month', 'trip_start_port', 'trip_end_port', 'trip_start_port_latitude', 'count') as.data.frame(x) ``` ```{r, message=FALSE, echo=FALSE} ok(ut_cmp_equal( mfdb_sample_count( mdb, c('month', 'areacell', 'tow', 'length', 'trip_start_port', 'trip_end_port'), list( month=mfdb_unaggregated(), areacell=mfdb_unaggregated(), tow=mfdb_unaggregated(), trip_start_port=mfdb_unaggregated(), trip_end_port=mfdb_unaggregated(), length = mfdb_unaggregated(), null = NULL))[[1]]$number, as.data.frame(mfdb_dplyr_sample( mdb, c('trip_start_port', 'trip_end_port', 'trip_start_port_latitude')))$count), "dplyr and sample_count produce same results") ``` ```{r} mfdb_disconnect(mdb) ```