Produce MA-plot of omics data.

Produce MA-plot of omics data, where jammaplot() uses base R graphics, ggjammaplot() uses ggplot2 graphics.

ggjammaplot(
  x,
  detail_factor = 1,
  nbin_factor = 1,
  bw_factor = 1,
  assay_name = 1,
  useMedian = FALSE,
  controlSamples = NULL,
  centerGroups = NULL,
  controlFloor = NA,
  naControlAction = c("row", "floor", "min", "na"),
  naControlFloor = 0,
  colramp = c("transparent", "lightblue", "blue", "navy", "orange", "orangered2"),
  groupedX = TRUE,
  grouped_mad = TRUE,
  outlierMAD = 5,
  mad_row_min = 4,
  displayMAD = FALSE,
  noise_floor = 0,
  noise_floor_value = NA,
  naValue = NA,
  centerFunc = centerGeneData,
  whichSamples = NULL,
  useRank = FALSE,
  titleBoxColor = "lightgoldenrod1",
  titleCex = 1,
  outlierColor = "lemonchiffon",
  fillBackground = TRUE,
  maintitle = NULL,
  subtitle = NULL,
  summary = "mean",
  difference = "difference",
  transFactor = 0.25,
  doPlot = TRUE,
  highlightPoints = NULL,
  highlightPch = 21,
  highlightCex = 1.5,
  highlightColor = NULL,
  doHighlightLegend = TRUE,
  ablineH = c(-2, 0, 2),
  base_size = 12,
  panel.grid.major.colour = "grey90",
  panel.grid.minor.colour = "grey95",
  return_type = c("ggplot", "data"),
  xlim = NULL,
  ylim = c(-6, 6),
  ncol = NULL,
  nrow = NULL,
  blankPlotPos = NULL,
  verbose = FALSE,
  ...
)

jammaplot(
  x,
  assay_name = NULL,
  maintitle = NULL,
  titleBoxColor = "#DDBB9977",
  subtitleBoxColor = titleBoxColor,
  centerGroups = NULL,
  controlSamples = colnames(x),
  controlFloor = NA,
  naControlAction = c("row", "floor", "min", "na"),
  naControlFloor = 0,
  controlIndicator = c("labelstar", "titlestar", "none"),
  sample_labels = NULL,
  useMedian = FALSE,
  useMean = NULL,
  ylim = c(-4, 4),
  xlim = NULL,
  highlightPoints = NULL,
  outlierMAD = 5,
  outlierRowMin = 5,
  displayMAD = FALSE,
  groupedMAD = TRUE,
  colramp = c("white", "lightblue", "blue", "navy", "orange", "orangered2"),
  colrampOutlier = NULL,
  outlierColor = "lemonchiffon",
  whichSamples = NULL,
  maintitleCex = 1.8,
  subtitle = NULL,
  subtitlePreset = "bottomleft",
  subtitleAdjPreset = "topright",
  titleCexFactor = 1,
  titleCex = NULL,
  doTitleBox = TRUE,
  titleColor = "black",
  titleFont = 2,
  titlePreset = "top",
  titleAdjPreset = "top",
  xlab = "",
  xlabline = 2,
  ylab = "",
  ylabline = 1.5,
  groupSuffix = NULL,
  highlightPch = 21,
  highlightCex = 1.5,
  highlightColor = "#00AAAA66",
  doHighlightPolygon = FALSE,
  highlightPolygonAlpha = 0.3,
  doHighlightLegend = TRUE,
  smoothPtCol = "#00000055",
  margins = c(2.5, 0.5, 2, 0.2),
  outer_margins = c(0, 1.5, 0, 0.2),
  useRaster = TRUE,
  ncol = NULL,
  nrow = NULL,
  doPar = TRUE,
  las = 2,
  groupedX = TRUE,
  customFunc = NULL,
  filterNA = TRUE,
  filterNAreplacement = NA,
  filterNeg = FALSE,
  noise_floor = 0,
  noise_floor_value = NA,
  filterFloor = NULL,
  filterFloorReplacement = NULL,
  transFactor = 0.18,
  nrpoints = 0,
  smoothScatterFunc = jamba::plotSmoothScatter,
  applyRangeCeiling = TRUE,
  doTxtplot = FALSE,
  ablineV = 0,
  ablineH = c(-2, 0, 2),
  blankPlotPos = NULL,
  fillBackground = TRUE,
  useRank = FALSE,
  ma_method = c("jammacalc", "old"),
  panel_hook_function = NULL,
  doPlot = TRUE,
  verbose = FALSE,
  ...
)

Arguments

x

numeric object usually a matrix that contains values with measurement rows, and sample/observation columns. For example, with gene or protein expression data, the genes or proteins (or the assays of genes or proteins) are represented in rows, and obtained samples are represented in columns. Alternatively x can be SummarizedExperiment object, used alongside argument assay_name.

detail_factor

numeric used to adjust the level of detail, as a multiplier for nbin_factor and bw_factor.

nbin_factor

numeric value used to adjust the number of bins used to display the MA-plots, where values higher than 1 increase the resolution and level of detail, and values below 1 decrease the resolution. Note the number of bins are already adjusted based upon the square root of the number of plot panels, and nbin_factor applied to that value.

bw_factor

numeric used to adjust the resolution of the 2-dimensional bandwidth calculation, where higher values create more detailed density, and lower values create a smoother density across the range of data. In some cases, the ggplot panel aspect ratio diverges from 1:1, in which case bw_factor can be used to expand the bandwidth by y-axis, or y-axis, respectively. For example, if the density appears short-wide, try bw_factor=c(1.2, 1), if the density appears tall-skinny, try bw_factor=c(1, 1.2).

assay_name

character used when x is a SummarizedExperiment object, to determine which assay matrix to use for the MA plots. When assay_name=NULL the first assay entry is used, for example assays(x)[[1]].

useMedian

logical indicates whether to center data using the median value, where useMedian=FALSE by default. The median is preferred in cases where outliers should not influence centering. The mean is preferred in cases where the data should visualize data in a manner consistent with downstream parametric statistical analysis. When a particular sample represents a technical outlier, one option to visualize data without being skewed by the outlier is to define controlSamples to exclude the outlier sample(s). In this way, data centering will be applied using the non-outlier samples as reference.

controlSamples

character vector of colnames(x) passed to centerGeneData() which defines the control samples during the data centering step. By default, and the most common practice, MA-plots are calculated across all samples, which effectively uses all colnames(x) as controlSamples. However, it is quite useful sometimes to provide a subset of samples especially if there are known quality samples, to which new samples of unknown quality are being compared.

centerGroups

character vector of groups passed to jamma::centerGeneData() which determines how data is centered. Each group is centered independently, to enable visual comparisons within each relevant centering group. It is useful to center within batches or within subsets of samples that are not intended to be compared to one another. Another useful alternative is to center by each sample group in order to view the variability among group replicates, which should be much lower than variability across sample groups. See centerGeneData() for more specific examples.

colramp

one of several inputs recognized by jamba::getColorRamp(). It typically recognizes either the name of a color ramp from RColorBrewer, the name of functions from the viridis package such as viridis::viridis(), or single R colors, or a vector of R colors. When a single color is supplied, a gradient is created from white to that color, where the default base color can be customized with defaultBaseColor="black" for example.

groupedX

logical indicating whether the x-axis value, which represents the median or mean value, should be calculated independently for each group when centerGroups is used with multiple groups. Typically groupedX=TRUE is recommended, however it can be beneficial to share an overall x-axis value in specific circumstances.

grouped_mad

logical indicating whether the MAD factor calculation of variability among samples should be performed independently for each group when centerGroups is used with multiple groups. Typically grouped_max=TRUE is recommended, however it can be beneficial to share an overall MAD factor threshold across all samples in specific circumstances.

outlierMAD

numeric threshold above which a MA-plot panel MAD factor is considered an outlier. When a MA-plot panel is considered an outlier, the outlierColramp or outlierColor is applied to the panel color ramp to display a visual indication.

mad_row_min

numeric value indicating the minimum x-axis value, calculated using either median or mean as defined by argument useMedian, at or above which a measurement is used in the MAD factor calculation. This threshold is useful to restrict the MAD variability calculation to measurements (rows in x) with signal that meets a minimum noise threshold.

displayMAD

logical indicating whether to display each MA-plot panel MAD factor (median absolute deviation). A MAD value for each panel is calculated by taking the median absolute deviation from zero across all points, using points whose mean value is equal or greater than outlierRowMin. The overall MAD is defined by the median MAD from the MA-plot panels. The MAD factor is defined as the ratio of each MA-plot panel MAD value to the overall MAD value, and therefore most MAD factor values should be roughly 1. The overall MAD value is defined by the median across all samples when groupedMAD=FALSE, or defined within each centerGroup when groupedMAD=TRUE. A value with MAD factor 2 is interpreted as a sample whose median deviation from zero is twice as high as the typical sample, which is a reasonably indication that this sample has twice the inherent level of noise compared to other samples. Note that MAD values should be interpreted within sample processing batches if relevant, or within logical experimental units -- roughly interpreted to mean sets of samples within which direct statistical comparisons are intended to be applied. For example, gene expression data that include brain and liver samples would probably use centerGroups for brain and liver to be centered separately, therefore the MAD factors should be separately calculated for brain and for liver.

noise_floor, noise_floor_value

numeric to define a numeric floor, or NULL for no numeric floor. Values at or below noise_floor are set to noise_floor_value, intended for two potential uses:

Filter out value below a threshold, so they do not affect centering.
- This option is valuable to remove zeros when a zero 0 is considered "no measurement observed", typically for count data such as RNA-seq, NanoString, and especially single-cell protocols or other protocols that produce a large number of missing values.
- One can typically tell whether input data includes zero 0 values by the presence of characteristic 45-degree angle lines originating from x=0 angled toward the right. The points along this line are rows with more measurements of zero than non-zero, there this sample has a non-zero value.
Set values at a noise floor to the noise floor, to retain the measurement but minimize the effect during centering to the lowest realiable measurement for the platform technology.
- This value may be set to a platform noise floor for something like microarray data where the intensity may be unreliable below a threshold; or
- for quantitative PCR measurements where cycle threshold (Ct) values may become unreliable, for example above CT=40 or CT=35. Data is often transformed to abundance with 2 ^ (40 - CT) then log2-transformed for analysis. In this case, to apply a noise_floor effective for CT=35, one would use noise_floor=5.

naValue

character string used to convert values of NA to something else. This argument is useful when a numeric matrix may contain NA values but would prefer them to be, for example, 0.

centerFunc

function used to supply a custom data centering function. In practice this argument should rarely be changed.

whichSamples

NULL or integer vector, representing an index subset of samples to include in the MA-plots. When whichSamples represents a subset of samples in x, the MA-plot calculations are performed on all samples, then only samples in whichSamples are displayed. This argument keeps the MA-plot calculations consistent even when viewing only one or a subset of samples in more detail.

useRank

logical indicating whether to create column-wide ranks, then create MA-plots using the rank data. When useRank=TRUE the y-axis represents the rank difference from mean, and the x-axis represents the mean rank. Using useRank=TRUE is a good method to evaluate whether data can be normalized, or whether data across samples is inherently noisy.

titleBoxColor

character vector of R colors, where titleBoxColor is equal to ncol(x), or where names(titleBoxColor) matches colnames(x). When supplied, each plot panel strip background will be colored accordingly.

outlierColor

character string representing one R color, used when colrampOutlier is NULL and when outlierMAD is defined. This color is used for MA-plot outlier panels by substituting the first color from the colramp color ramp, to act as a visual cue that the panel represents an outlier.

fillBackground

logical currently used for base R graphics output, and passed to jamba::plotSmoothScatter(), indicating whether to fill the plot panel using the first color in the color ramp for each MA-plot panel, or when a plot panel is an outlier, it uses outlierColor. This argument is useful for all plot panels especially when the colramp base color is not white (or otherwise does not match the background of the plot device, for example colramp="viridis"). This argument is also used with outlierColor to add visual emphasis to plot panels where the MAD factor exceeds outlierMAD.

maintitle

character string with the title displayed above all individual MA-plot panels. It will appear in the top outer margin.

subtitle

NULL or character vector to be drawn at the bottom left corner of each plot panel, the location is defined by subtitlePreset.

transFactor

numeric adjustment to the visual density of smooth scatter points. For base R graphics, this argument is passed to jamba::plotSmoothScatter(). The argument value is based upon graphics::smoothScatter() argument transformation, which uses default function(x)x^0.25. The transFactor is equivalent to the exponential in the form: function(x)x^transFactor. Lower values make the point density more visually intense, higher values make the point density less visually intense.

doPlot

logical indicating whether to create plots. When doPlot=FALSE only the MA-plot panel data is returned.

highlightPoints

optional set of rows to highlight on each MA-plot panel, drawn as a set of points on top of the in one of the following forms:

character vector matching rownames(x), indicating a single set of points to highlight in one color defined in highlightColor. Internally, it is converted to a list containing one vector.
list of character vectors, each containing rownames(x). Each vector is highlighted as a set, using colors defined in highlightColor which ideally should have length equal to length(highlightPoints).
NULL no points are highlighted.

highlightCex

numeric vector used when highlightPoints is defined. It is recycled to length(highlightPoints) after highlightPoints is converted to a list if necessary. Values are therefore applied to each set of points in the list. It can be supplied as a list, which will be recycled to length(highlightPoints) and applied in order to each vector of points highlighted.

highlightColor

character vector used when highlightPoints is defined. It is recycled to length(highlightPoints) after highlightPoints is converted to a list if necessary. Colors are therefore applied to each set of points in the list.

doHighlightLegend

logical indicating whether to print a color legend when highlightPoints is defined. The legend is displayed in the bottom outer margin of the page using outer_legend(), and the page is adjusted to add bottom outer margin. When the legend is particularly large, it may be preferable to hide the color legend and display the legend another way, for example legend().

ablineH, ablineV

numeric vector indicating position of horizontal and vertical lines in each MA-plot panel. Either argument can be supplied as a list, which will be applied to each plot panel in order, which allows specifying specific abline values in individual panels.

xlim

numeric or NULL to define a fixed numeric range for x-axis values. When xlim=NULL the ranges are defined by the numeric x-axis values in each centerGroups grouping, so that each group can have its own independent x-axis ranges. When there is a large proportion of values with x=0 (or near zero), there are two options to reduce the point density near zero, which can improve visibility of non-zero points:

Use noise_floor=0 and noise_floor_value=NA, which replaces values at or below zero with NA, thereby hiding these points from each plot panel. Note the NA values are also not used during data centering calculations.
Define xlim=c(0.001, 20) and applyRangeCeiling=FALSE, which defines the x-axis minimum slightly above zero, and applyRangeCeiling=FALSE does not display points outside the x-axis range at the plot limits, thereby hiding those points. This method does not replace values with NA, therefore all non-NA values are used during data centering.

ylim, xlim

NULL or numeric vector length=2 indicating the y-axis and x-axis ranges, respectively. The values are useful to define consistent dimensions across all panels. The default ylim=c(-4, 4) represents 16-fold up and down range in normal space, assuming the data content has already been log2 transformed, and is typically a reasonable starting point for most purposes. Even if numeric values are all between -1.5 and 1.5, it is still recommended to keep a range in context of c(-4, 4), or an appropriate and consistent range given the data content, so the data has some visual context. The range c(-4, 4) should be adjusted relative to the typical ranges expected for the data.

ncol, nrow

integer number of MA-plot panel columns and rows passed to graphics::par("mfrow") when doPar=TRUE. When only one value is supplied, nrow or ncol, the other value is defined by ncol(x) and blankPlotPos so all panels can be contained on one page. When nrow and ncol are defined such that multiple pages are produced, each page will be annotated with maintitle and doHighlightLegend if relevant.

blankPlotPos

NULL or integer vector indicating plot panel positions to be drawn blank, and therefore skipped. This argument is intended to allow manual placement of plot panels with spacing that reinforces a logical layout. Plot panels are drawn in the exact order of colnames(x) received, with panels placed into a number of columns and rows defined with ncol, nrow or jamba::decideMfrow() when doPar=TRUE. Blank panel positions are intended to help customize the visual alignment of MA-plot panels. The mechanism is similar to ggplot2::facet_wrap() except that blank positions can be manually defined by what makes sense for the experiment design. This method is not particularly user-friendly, but allows fine control over plot panel spacing.

verbose

logical indicating whether to print verbose output.

...

additional parameters sent to downstream functions, jamba::plotSmoothScatter(), jammacalc(), centerGeneData().

useMean

(deprecated) logical, use useMedian. This argument indicates whether to center data using the mean value. When useMean=NULL the argument useMedian is preferred. For backward compatibility, when useMean is not NULL, then useMedian is defined by useMedian <- !useMean.

outlierRowMin

numeric value indicating the minimum mean value as displayed on the MA-plot panel x-axis, in order for the row to be included in MAD calculations. This argument is intended to prevent measurements whose mean value is below a noise threshold from being included, therefore only including points whose mean measurement is above noise and represents "typical" variability.

groupedMAD

logical indicating how the MAD calculation should be performed: groupedMAD=TRUE (default) calculates the median absolute deviation (MAD) from y=0 for each sample per centerGroups value, then the median MAD value for all samples in each group. The corresponding MAD factor is defined within each group. When groupedMAD=FALSE, the median MAD value is calculated across all samples, regardless of centerGroups value, from which the MAD factor is defined.

colrampOutlier

one of several inputs recognized by jamba::getColorRamp() to define a specific color ramp for MA-plot outlier panels, used when outlierMAD is defined. When colrampOutlier is NULL the outlierColor is used.

maintitleCex

numeric cex character expansion used to resize the maintitle.

subtitlePreset

character string passed to jamba::coordPresets(). The default subtitlePreset="bottomleft" defines the bottom-left corner of each panel.

subtitleAdjPreset

character string passed to jamba::coordPresets(). The default subtitleAdjPreset="topright" places labels to the top-right of the subtitle position, which by default is the bottom-left corner of each panel.

doTitleBox

logical indicating whether to draw plot titles using a colored box. When doTitleBox=TRUE the jamba::drawLabels() is called to display a label box at the top of each plot panel, with drawBox=TRUE. When doTitleBox=FALSE, jamba::drawLabels() is called with drawBox=FALSE.

titleColor

character vector of colors applied to title text in each MA-plot panel. When doTitleBox=TRUE and titleColor contains only one or no value, the title color is defined by jamba::setTextContrastColor() along with titleBoxColor.

titleFont

integer font compatible with par("font"). Values are recycled across panels, so each panel can use a custom value if needed.

titlePreset

character string passed to jamba::coordPresets(). The default titlePreset="top" defines the top edge of each panel, and default titleAdjPreset="top" places the title at the top edge of this position.

titleAdjPreset

character string passed to jamba::coordPresets(). The default titleAdjPreset="top" places labels above the titlePreset location, by default above the top edge of each panel. To place the title inside (below) the top edge of the plot panel, use titlePreset="top", titleAdjPreset="bottom", however the title box may overlap and hide data points in the plot panel.

xlab, ylab

character x- and y-axis labels, respectively. The default values are blank "" because there are a wide variety of possible labels, and the labels take up more space than is often useful for most MA-plots.

xlabline, ylabline

numeric number indicating the text line distance from the edge of plot border to place xlab and ylab text, as used by graphics::title().

groupSuffix

(deprecated) character text appended to each MA-plot panel title. Use argment subtitle as the preferred alternative.

doHighlightPolygon

logical indicating whether to draw a shaded polygon encompassing highlightPoints, using each highlightColor.The polygon is defined by grDevices::chull() via the function points2polygonHull().

highlightPolygonAlpha

numeric value indicating alpha transparency used for the highlight polygon when doHighlightPolygon=TRUE, where 0 is fully transparent, and 1 is completely not transparent (opaque).

smoothPtCol

color used to draw points when nrpoints is non-zero, which draws points in the extremities of the smooth scatter plot. See jamba::plotSmoothScatter(). The effect can also be achieved by adjusting transFactor to a lower value, which increases the visual contrast of individual points in the point density.

margins

numeric vector of margins compatible with graphics::par("mar"). Default values are provided here for convenience. Since version 0.0.31.900 the outer_margins are used to control y-axis label whitespace, and and y-axis labels are not displayed on every plot panel when doPar=TRUE.

outer_margins

numeric vector of margins compatible with graphics::par("oma"). Default values are provided here for convenience. The left outer marging is used to allow whitespace to display y-axis labels. Note when useRank=TRUE the y-axis whitespace is increased to accomodate larger integer label values.

useRaster

logical indicating whether to draw the smooth scatter plot using raster logic, useRaster=TRUE is passed to jamba::plotSmoothScatter(). The default TRUE creates a much smaller plot object by rendering each plot panel as a single raster image instead of rendering individual colored rectangles. There is no driving reason to use useRaster=FALSE except if the rasterization process itself is problematic.

doPar

logical indicating whether to apply graphics::par("mfrow") to define MA-plot panel rows and columns. When doPar=FALSE each plot panel is rendered without adjusting the graphics::par("mfrow") setting, which is appropriate for displaying each plot panel individually.

las

integer value 1 or 2 indicating whether axis labels should be parallel or perpendicular to the axes, respectively.

customFunc

optional function used instead of the summary function defined by useMedian during the data centering step. This function should take a matrix of numeric values as input, and must return a numeric vector length equal to nrow(x). This option is intended to allow custom row statistics, for example geometric mean, or other row summary functions.

filterNA, filterNAreplacement

logical and vector respectively. When filterNA=TRUE, all NA values are replaced with filterNAreplacement. This process is conceptually opposite of noise_floor which replaces a numeric value with NA or another numeric value. Instead, filterNA is intended to convert NA values into a known numeric value, typically using an appropriate noise floor value such as zero 0. Practically speaking, NA values should probably be left as NA values, so that data centering does not use these values, and so the MA-plot panel does not draw a point when no measurement exists.

filterNeg

(deprecated) logical argument, use noise_floor.

filterFloor, filterFloorReplacement

(deprecated) in favor of noise_floor, and noise_floor_replacement respectively.

nrpoints

integer or NULL indicating the number of points to display on the extremity of the smooth scatter density, passed to jamba::plotSmoothScatter().

smoothScatterFunc

function used to produce a smooth scatter plot in base R graphics. The default jamba::plotSmoothScatter() controls the level of detail in the density calculation, and in the graphical resolution of that density in each plot panel. The custom function should accept argument transformation as described in transFactor, even if the argument is not used. This argument could be useful to specify customizations not convenient to apply otherwise.

applyRangeCeiling

logical passed to jamba::plotSmoothScatter() which determines how to handle points outside the plot x-axis and y-axis range: applyRangeCeiling=TRUE will place points at the border of the plot, which is helpful to indicate that there are more points outside the viewing range; applyRangeCeiling=FALSE will crop and remove points outside the viewing range, which is helpful for example when a large number of points are at zero and overwhelm the point density. When there are a large proportion of values at zero, it can be helpful to apply xlim=c(0.01, 20) and applyRangeCeiling=FALSE.

doTxtplot

logical (not yet implemented in jamma), indicating to produce colored ANSI text plot output, for example to a text terminal.

ma_method

(deprecated) character string indicating the internal method used for MA-plot calculations:

"old" to use the previous (older) calculation method, which is deprecated and will be removed in future.
"jammacalc" which uses the independent function jammacalc().

panel_hook_function

optional custom function called as a "hook" after each MA-plot panel has been drawn. This function can be used to display custom axis labels, add plot panel visual accents, lines, labels, etc. This panel_hook_function is recycled as a list to the number of samples ncol(x), which can be used to supply a unique function for each panel. Any element in the list with length=0 or NA is skipped for the corresponding MA-plot panel. This function should accept at least two arguments, even if ignored:

i - an integer indicating the sample to be plotted in order, as defined by colnames(x), and whichSamples in the event samples are subsetted or re-ordered with whichSamples.
... additional arguments passed by ... into this custom function.
Any arguments of jammaplot() are available inside the panel hook function as a by-product of calling this function within the environment of the active jammaplot(), therefore any argument values will be available for use inside that function.

titleBoxColor, subtitleBoxColor

character vector of R colors used as background color for each panel title text, or subtitle text respectively. The subtitle appears in the bottom-left corner, and usually indicates the center groups as defined by centerGroups.

Value

list of numeric

matrix objects, one for each MA-plot, with colnames "x" and "y". This list is sufficient input to jammaplot() to re-create the full set of MA-plots.

Details

jammaplot takes a numeric matrix, typically of gene expression data, and produces an MA-plot (Bland-Altman plot), also known as a median-difference plot. One panel is created for each column of data. Within each panel, the x-axis represents the mean or median expression of each row; the y-axis represents the difference from mean or median for that column.

By default, the plot uses jamba::plotSmoothScatter(), with optional highlighted points draw using points().

The function will determine an appropriate layout of plot panels, which can be overridden using ncol and nrow to specify the number of columns and rows of plot panels, respectively. For now, this function uses base R graphics instead of ggplot2, in order to accomodate some custom features.

This function uses "useRaster=TRUE" by default, which causes jamba::plotSmoothScatter() to render a rasterized image as opposed to a composite of colored rectangles. This process substantially reduces the render time in all cases, and reduces the image size when saving as PDF or SVG.

Notable features

Highlighting points

Specific points can be highlighted with argument highlightPoints which can be a vector or named list of vectors, containing rownames(x). When using a list, point colors are assigned to each element in the list in order, using the argument highlightColor.

Centering by control samples

Typical MA-plots are "global-centered", which calculates the mean/median across all columns in x, and this value is subtracted from each individual value per row.

By specifying controlSamples the mean/median is calculated using only the colnames(x) which match controlSamples, thus representing "difference from control."

It may also be useful to center data by known high-quality samples, so the effect of potential outlier samples is avoided.

Centering within subgroups

By specifying centerGroups as a vector of group names, the centering is calculated within each group of colnames(x). In this way, subsets of samples can be treated independently in the MA-plots. A good example might be producing MA-plots for "kidney" samples, and "muscle" samples, which may have fundamentally different signal distributions. A good rule of thumb is to apply centerGroups to represent separate groups of samples where you do not intend to apply direct statistical comparisons across those samples, without at least applying a two-way contrast, a fold change of fold changes.

Another informative technique is to center by sample group, for example centerGroups=sample_group. This technique produces MA-plots that depict the "difference from group" for each sample replicate of a sample group, and is very useful for identifying sample replicates with markedly higher variability to its sample group than others. In general, the variability within sample group should be substantially lower than variability across sample groups. Use displayMAD=TRUE and outlierMAD=2 as a recommended starting point for this technique.

Applying a noise floor

The argument noise_floor provides a numeric lower threshold, where individual values at or below this threshold are set to a defined value, defined by argument noise_floor_value. The default was updated in version 0.0.21.900 to noise_floor=0 and noise_floor_value=NA. Values of zero 0 are set to NA and therefore are not included in the MA-plot calculations. Only points above zero are included as points in each MA-plot panel.

Another useful alternative is to define noise_floor_value=noise_floor which sets any measurement at or below the noise_floor to this value. This option has the effect of reducing random noise from points that are already below the noise threshold and therefore are unreliable for this purpose.

Customizing the panel layout

Panels are drawn using the order of colnames(x) by row, from left-to-right, then top-to-bottom. The argument blankPlotPos is intended to insert an empty panel at a particular panel position, to help customize the alignment of sample panels. This option is typically used with ncol and nrow to define a fixed layout of panel columns and rows. blankPlotPos refers to panels numbered as drawn per row of panels,

Identifying potential sample outliers

Use argument displayMAD=TRUE to display the per-sample MAD factor relative to its centerGroups value, if provided. The MAD value for each MA-plot panel is calculated using rows whose mean is at or above outlierRowMin. The median MAD value is calculated for each centerGroups grouping when groupedMAD=TRUE, by default. Finally, each MA-plot panel MAD factor is the ratio of its MAD value to the relevant median MAD value. MA-plot panels with MAD factor above outlierMAD are considered outliers, and the color ramp uses outlierColramp or outlierColor as a visual cue.

Putative outlier samples should usually not be determined when:

controlSamples are defined to include only a subset of sample groups,
centerGroups is not defined, or represents more than one set of sample groups that are not intended to be statistically compared directly to one another.

Putative outlier samples may be defined when:

centerGroups represents a set of sample groups that are intended to be involved in direct comparisons
centerGroups represents each sample group

Potential sample outliers may be identified by setting a threshold with outlierMAD, by default 5xMAD. For a sample to be considered an outlier, its median difference from mean/median needs to be five times higher than the median across samples.

We typically recommend an outlierMAD=2 when centering by sample groups, or when centering within experiment subsets. For one sample to have 2xMAD factor, its variance needs to be uniquely twice as high as the majority of other samples, which is typically symptomatic of possible technical failure.

There are exceptions to this suggested guideline, which includes scenarios where a batch effect may be involved.

To do:

Accept other object types as input, including Bioconductor classes: ExpressionSet, SummarizedExperiment, MultiExperimentSet
Make it efficient to convey group information, for example define titleBoxColor with group colors, allow centerByGroup=TRUE which would re-use known sample group information.
Adjust the suffix to indicate when centerGroups are being used. For example indicate 'sampleID vs groupA' instead of 'sampleID vs median'.

Functions

ggjammaplot():

Examples

if (jamba::check_pkg_installed("SummarizedExperiment") &&
   jamba::check_pkg_installed("farrisdata")) {
   suppressPackageStartupMessages(require(SummarizedExperiment));

   GeneSE <- farrisdata::farrisGeneSE;

   titleBoxColor <- jamba::nameVector(
      farrisdata::colorSub[as.character(colData(GeneSE)$groupName)],
      colnames(GeneSE));
   options("warn"=FALSE);

   gg <- ggjammaplot(GeneSE,
      ncol=6,
      base_size=12,
      assay_name="raw_counts")

   gg <- ggjammaplot(GeneSE,
      ncol=6,
      assay_name="counts",
      useRank=TRUE,
      ylim=c(-11000, 11000),
      maintitle="MA-plots by rank and rank difference",
      titleBoxColor=titleBoxColor)

   gg <- ggjammaplot(GeneSE,
      ncol=6,
      assay_name="counts",
      titleBoxColor=titleBoxColor,
      base_size=10,
      maintitle="MA-plots showing MAD factor",
      displayMAD=TRUE)

   gg <- ggjammaplot(GeneSE,
      ncol=6,
      assay_name="counts",
      titleBoxColor=titleBoxColor,
      maintitle="MA-plot omitting one panel, then using blankPlotPos",
      whichSamples=colnames(GeneSE)[c(1:21, 23:24)],
      blankPlotPos=22,
      displayMAD=TRUE)

   if (FALSE) {
   ggdf <- ggjammaplot(GeneSE,
      assay_name="counts",
      whichSamples=c(1:3, 7:9),
      return_type="data",
      titleBoxColor=titleBoxColor)
   highlightPoints1 <- names(jamba::tcount(subset(ggdf, mean > 15 & difference < -1)$item, 2))
   highlightPoints2 <- subset(ggdf, name %in% "CA1CB492" &
      difference < -4.5)$item;
   highlightPoints <- list(
      divergent=highlightPoints1,
      low_CA1CB492=highlightPoints2);

   ggdf_h <- ggjammaplot(GeneSE,
      assay_name="counts",
      highlightPoints=highlightPoints,
      whichSamples=c(1:3, 7:9),
      return_type="data",
      titleBoxColor=titleBoxColor)

   # you can use output from `jammaplot()` as input to `ggjammaplot()`:
   jp2 <- jammaplot(GeneSE,
      outlierMAD=2,
      doPlot=FALSE,
      assay_name="raw_counts",
      filterFloor=1e-10,
      filterFloorReplacement=NA,
      centerGroups=colData(GeneSE)$Compartment,
      subtitleBoxColor=farrisdata::colorSub[as.character(colData(GeneSE)$Compartment)],
      useRank=FALSE);

   gg1 <- ggjammaplot(jp2,
      ncol=6,
      titleBoxColor=titleBoxColor);
   print(gg1);
   }
}
#> Warning: Removed 534 rows containing non-finite values (`stat_density2d()`).
#> Warning: Removed 15648 rows containing missing values (`geom_raster()`).

#> Warning: Removed 140 rows containing non-finite values (`stat_density2d()`).
#> Warning: Removed 15648 rows containing missing values (`geom_raster()`).

#> Warning: Removed 322 rows containing non-finite values (`stat_density2d()`).
#> Warning: Removed 15648 rows containing missing values (`geom_raster()`).

#> Warning: Removed 317 rows containing non-finite values (`stat_density2d()`).
#> Warning: Removed 15272 rows containing missing values (`geom_raster()`).


# Note the example data requires the affydata Bioconductor package
if (suppressPackageStartupMessages(require(affydata))) {
   data(Dilution);
   edata <- log2(1+exprs(Dilution));
   jammaplot(edata);

   jammaplot(edata,
      whichSamples=c(1, 2));

   jammaplot(edata,
      sample_labels=paste("Sample", colnames(edata)));

   jammaplot(edata,
      controlIndicator="titlestar");

   jammaplot(edata,
      controlIndicator="none");

   jammaplot(edata,
      panel_hook_function=function(i,...){box("figure")});

   jammaplot(edata,
      useRank=TRUE,
      maintitle="Rank MA-plots");
}