library(DiagrammeR)
20 Example workflow: indoor dust
A walkthrough for indoor dust analysis
Workflow for nontarget using XX
20.1 GC orbitrap HRMS workflow
This code uses XCMS with MF -> CAMERA preprocessing used in : Stettin, D.; Poulin, R.X.; Pohnert, G. Metabolomics Benefits from Orbitrap GC–MS—Comparison of Low- and High-Resolution GC–MS. Metabolites 2020, 10, 143. See the supplementary material for original code. The code was modified here to also include retention time index and also remove zero intensity peaks after peak grouping. Additional metainformation for the msp file can also be added in the “extra” object.
library(processx)
library(xcms)
library(CAMERA)
library(metaMS)
library(dplyr)
library(stringr)
#first define the working directory (folder with experimental group folders inside)
# mzMLfiles <- list.files(path = "/home/ORUNET.ORU.SE/twg/Dioxinlab/GC-Orbitrap data/Florian/Dust NTA 20200513/mzXML/",pattern = ".mzXML", recursive = TRUE, full.names = TRUE)
# Remove files containing "Hex", blank samples
# mzMLfiles <- mzMLfiles[-grep("Hex", mzMLfiles)]
<- readxl::read_xlsx("/home/ORUNET.ORU.SE/twg/Windows_home/Dust_Florian/Sample_list_NTA_dust.xlsx",
sample_info sheet = "Sample")
# filter only samples
<- sample_info %>%
sample_info ::filter(!(group %in% c("blank_solvent")))
dplyr
<- as.character(sample_info$filename)
mzXMLfiles
<- data.frame(sample_name = paste0(sample_info$new_codes, ".mzXML"),
pd class = sample_info$group,
stringsAsFactors = FALSE)
# pd <- pd %>% dplyr::filter(sample_group != "IS") %>% dplyr::filter(sample_group != "RTI")
<- 17 #how many fragments need to be in a group for it to be included in the results. 5 - trace compounds; 20 - mostly high quality spectra
min.clustersize useOriginalCode(TRUE) #otherwise matchedFilter will allocate a huge amount of RAM when using step = 0.001
#Refer to XCMS and CAMERA documentation for information about parameters and functions
#These are the parameters used for high rez GC-Orbitrap data
<- xcmsSet(mzXMLfiles,
xs1 phenoData = pd,
method = "matchedFilter",
fwhm = 3, step = 0.001,
steps = 2, max = 1000,
snthresh = 3, mzdiff = 0.002)
<- group(xs1, method = "density", bw = 5, mzwid = 0.002, minsamp = 1, minfrac = 0, max = 1000)
xs2
<- retcor(xs2, method = "obiwarp")
xs3
<- group(xs3, method = "density", bw = 2, mzwid = 0.6, minsamp = 1, minfrac = 0, max = 1000)
xs4
<- group(xs3, method = "density", bw = 2, mzwid = 0.002, minsamp = 1, minfrac = 0, max = 1000)
xs4
<- fillPeaks(xs4)
xs5
<- xsAnnotate(xs = xs5, sample=NA, polarity = "positive")
xs6
<- groupFWHM(xs6, sigma = 6 , perfwhm = 2, intval = "maxo")
xs7
<- groupCorr(xs7, cor_eic_th = 0.8, calcCaS = TRUE)
xs8
<- getPeaklist(xs8, intval="into")
peaktable write.csv(peaktable, file = "untreated_peaktable_MatchedFilter.csv") #This creates a peaktable .csv file with all m/z features sorted into "compounds"
#the following script identifies all "compounds" considered too small according to min.clustersize
#you don't need to modify anything, that run the next part as is
<- getPeaklist(xs8, intval="maxo")
peaktable <- sort(unique(as.numeric(peaktable$pcgroup)))
pcgroups <- NULL
lspectra <- NULL
extra <- NULL
small.clusters <- NULL
big.clusters <- 1
n for(x in pcgroups) {
<- length(peaktable[peaktable$pcgroup==x,ncol(peaktable)])
clustersize if(clustersize < as.numeric(min.clustersize))
{<- c(small.clusters, x)
small.clusters
}else
{<- c(big.clusters, x)
big.clusters <- peaktable[peaktable$pcgroup==x,]
gruppe1 <- gruppe1[, -(2:(ncol(gruppe1)-3-(length(mzXMLfiles))))]
gruppe1 <- gruppe1[, -(ncol(gruppe1):(ncol(gruppe1)-2))]
gruppe1 <- NULL
decider for(i in 2:ncol(gruppe1))
{<- c(decider, sum(gruppe1[, i]))
decider
}<- which(decider==max(decider), arr.ind = TRUE)
highest <- data.frame(gruppe1[, 1], gruppe1[, highest[1]+1])
gruppe1 colnames(gruppe1) <- c("mz", "int")
<- gruppe1
lspectra[[n]] <- n+1
n
}
}
# remove peaks with zero intensities
<- lapply(lspectra, function(x) {x[x$int != 0,]})
lspectra
<- getReducedPeaklist(xs8, method = "sum", intval = "into", default.adduct.info = "maxint", mzrt.range = FALSE, npeaks.sum = FALSE, cleanup = FALSE)
reduced.peaktable if(is.null(small.clusters)==FALSE) {
for(z in small.clusters)
{<- reduced.peaktable[-(which(reduced.peaktable[, ncol(reduced.peaktable)]==z)), ]
reduced.peaktable
}
}write.csv(reduced.peaktable, file = "peaktable_MatchedFilter.csv") #This creates a peaktable .csv file with only "compounds" instead of m/z features
<- data.frame(Name = paste("Unknown", big.clusters, "RT =",round(reduced.peaktable$rt/60, 2) ), Class = "Unknown", RT = round(reduced.peaktable$rt/60, 2), stringsAsFactors = FALSE)
extra
####KOVATS#####
#Adding Kovats retention index to the extra obejct to write to msp
<- data.frame(rt = extra$RT)
data <- data.frame(Num = c(11, 13, 15, 17, 19, 21, 23, 25),
alkaneSeries rt = c(4.90, 8.00, 11.13, 14.05, 16.70, 19.37, 22.45, 25.77))
<- vector(length = nrow(data))
RI for (i in seq_len(nrow(data))) {
<- dplyr::case_when(
m $rt[i] >= alkaneSeries$rt[1] & data$rt[i] < alkaneSeries$rt[2] ~ alkaneSeries$Num[1],
data$rt[i] >= alkaneSeries$rt[2] & data$rt[i] < alkaneSeries$rt[3] ~ alkaneSeries$Num[2],
data$rt[i] >= alkaneSeries$rt[3] & data$rt[i] < alkaneSeries$rt[4] ~ alkaneSeries$Num[3],
data$rt[i] >= alkaneSeries$rt[4] & data$rt[i] < alkaneSeries$rt[5] ~ alkaneSeries$Num[4],
data$rt[i] >= alkaneSeries$rt[5] & data$rt[i] < alkaneSeries$rt[6] ~ alkaneSeries$Num[5],
data$rt[i] >= alkaneSeries$rt[6] & data$rt[i] < alkaneSeries$rt[7] ~ alkaneSeries$Num[6],
data$rt[i] >= alkaneSeries$rt[7] & data$rt[i] <= alkaneSeries$rt[8] ~ alkaneSeries$Num[7]
data
)
<- dplyr::case_when(
n $rt[i] >= alkaneSeries$rt[1] & data$rt[i] < alkaneSeries$rt[2] ~ alkaneSeries$Num[2],
data$rt[i] >= alkaneSeries$rt[2] & data$rt[i] < alkaneSeries$rt[3] ~ alkaneSeries$Num[3],
data$rt[i] >= alkaneSeries$rt[3] & data$rt[i] < alkaneSeries$rt[4] ~ alkaneSeries$Num[4],
data$rt[i] >= alkaneSeries$rt[4] & data$rt[i] < alkaneSeries$rt[5] ~ alkaneSeries$Num[5],
data$rt[i] >= alkaneSeries$rt[5] & data$rt[i] < alkaneSeries$rt[6] ~ alkaneSeries$Num[6],
data$rt[i] >= alkaneSeries$rt[6] & data$rt[i] < alkaneSeries$rt[7] ~ alkaneSeries$Num[7],
data$rt[i] >= alkaneSeries$rt[7] & data$rt[i] <= alkaneSeries$rt[8] ~ alkaneSeries$Num[8]
data
)
<- round(100*n + 100*(m-n) * (data$rt[i] - alkaneSeries[alkaneSeries$Num == n,]$rt)/(alkaneSeries[alkaneSeries$Num == m,]$rt - alkaneSeries[alkaneSeries$Num == n,]$rt), 0)
RI[i]
}
<- cbind(extra, RI)
extra
######END KOVATS######
# create the msp object with spectra and all meta information in the extra object (NAME, RETENTIONTIME, RETENTIONINDEX)
<- construct.msp(lspectra, extra)
export.msp
write.msp(export.msp, file = "spectra_MatchedFilter20201030.msp") #This creates a NIST MS Search compatible .msp file with all compound pseudospectra
This code uses XCMS with centWave. Check differences and skip first steps to combine both workflows
#first define the working directory (folder with experimental group folders inside)
# mzMLfiles <- list.files(path = "/mzXML/",pattern = ".mzXML", recursive = TRUE, full.names = TRUE)
# Remove files containing "Hex", blank samples
# mzMLfiles <- mzMLfiles[-grep("Hex", mzMLfiles)]
<- readxl::read_xlsx("Sample_list_NTA_dust.xlsx",
sample_info sheet = "Sample_xcms")
# filter only samples
<- sample_info %>%
sample_info ::filter(!(group %in% c("blank_solvent")))
dplyr
<- data.frame(sample_name = paste0(sample_info$analysis, ".mzXML"),
pd sample_group = sample_info$group,
stringsAsFactors = FALSE)
<- paste0(sample_info$path, "/", sample_info$analysis, ".mzXML")
mzXMLfiles
# Use subset to test
<- pd[1:2,]
pd <- mzXMLfiles[1:2]
mzXMLfiles
<- readMSData(mzXMLfiles,
raw_data pdata = new("NAnnotatedDataFrame", pd),
mode = "onDisk")
# pd <- pd %>% dplyr::filter(sample_group != "IS") %>% dplyr::filter(sample_group != "RTI")
#-----Find features----#
# centWave params
<- CentWaveParam(ppm = 5,
cwp peakwidth = c(3, 45),
snthresh = 10,
prefilter = c(1,30000),
mzCenterFun = "wMean",
integrate = 1L,
mzdiff = -0.001,
fitgauss = FALSE,
noise = 30000,
verboseColumns = FALSE)
<- findChromPeaks(mzXMLfiles, param = cwp)
xs1
# use model peak to evaluate process
<- c(1440, 1460)
rtr <- c(340.238, 340.242)
mzr <- chromatogram(mzXMLfiles, mz = mzr, rt = rtr)
chr_raw <- chromatogram(xs1, mz = mzr, rt = rtr)
chr_mzr <- paste0(brewer.pal(3, "Set1")[1:2], "60")
group_colors <- group_colors[xs1$sample_group]
sample_colors
#----Group features 1----#
<- PeakDensityParam(
pdp sampleGroups = pd$sample_group,
bw = 10,
minFraction = 0,
minSamples = 1,
binSize = 0.002,
maxFeatures = 1000)
<- groupChromPeaks(xs1, param = pdp)
xs2
plotChromPeakDensity(chr_mzr, col = sample_colors, param = pdp)
#----retention time correction----#
<- ObiwarpParam(binSize = 0.05, # need to check optimal binSize
obip centerSample = integer(),
response = 1L,
distFun = "cor_opt",
gapInit = numeric(),
gapExtend = numeric(),
factorDiag = 2,
factorGap = 1,
localAlignment = FALSE,
initPenalty = 0,
subset = integer(),
subsetAdjust = c("average", "previous"))
<- adjustRtime(xs2, param = obip)
xs3
# Get the base peak chromatograms
<- chromatogram(xs3, aggregationFun = "max", include = "none")
bpis_adj par(mfrow = c(2, 1), mar = c(4.5, 4.2, 1, 0.5))
plot(bpis_adj)
# Plot also the difference of adjusted to raw retention time.
plotAdjustedRtime(xs3)
par(mfrow = c(2, 1))
## Plot the raw data
plot(chr_raw)
## Extract the chromatogram from the adjusted object
<- chromatogram(xs3, rt = rtr, mz = mzr)
chr_adj plot(chr_adj, peakType = "none")
#----Group features 2----#
# After retention time correction, the rt values are modified and additional grouping is needed.
<- PeakDensityParam(
pdp2 sampleGroups = pd$sample_group,
bw = 5,
minFraction = 0,
minSamples = 1,
binSize = 0.2,
maxFeatures = 1000)
<- groupChromPeaks(xs3, param = pdp2)
xs4
<- chromatogram(xs4, mz = mzr, rt = rtr)
xs4_chrom plotChromPeakDensity(xs4_chrom, col = sample_colors, param = pdp2)
# iteratively decrease the binSize
<- PeakDensityParam(
pdp3 sampleGroups = pd$sample_group,
bw = 2,
minFraction = 0,
minSamples = 1,
binSize = 0.002,
maxFeatures = 1000)
<- groupChromPeaks(xs4, param = pdp3)
xs4
<- chromatogram(xs4, mz = mzr, rt = rtr)
xs4_chrom plotChromPeakDensity(xs4_chrom, col = sample_colors, param = pdp3)
#----Fill missing features----#
<- fillChromPeaks(xs4)
xs5
## Check missing values before filling in peaks
apply(featureValues(xs5, filled = FALSE), MARGIN = 2,
FUN = function(z) sum(is.na(z)))
## Missing values after filling in peaks
apply(featureValues(xs5), MARGIN = 2,
FUN = function(z) sum(is.na(z)))
#---- QC ----#
# Extract chromatograms of 4 features
<- featureChromatograms(xs5, features = 10000:10004)
feature_chroms
plot(feature_chroms)
boxplot(featureValues(xs5, value="into") +1,
#col=as.numeric(sampclass(mtbls2Set))+1,
log="y", las=2)
<- 4.0 ## Filter low-standard deviation rows for plot
sdThresh <- log(featureValues(xs5))+1
data <- pca(data, nPcs=3)
pca.result plotPcs(pca.result, type="loadings",
#col=as.numeric(sampclass(mtbls2Set))+1
)
# It is possible to use the retention time correction and grouping step in an iterative way if needed.
# Once you perform your last adjustRtime step and thus your last grouping step, you will obtain your final peak list (i.e. final list of ions)
#----Annotation using CAMERA----#
# Since CAMERA has not yet been ported to XCMSnExp,we need to convert to xcmsSet.
# Note that the conversion only makes sense for somple XCMSnSets,
# without e.g. MS level filtering (where CAMERA would then extract the wrong peaks)
<- as(xs5, "xcmsSet")
xs6
<- xsAnnotate(xs = xs6, sample=NA, polarity = "positive")
xs6
<- groupFWHM(xs6, sigma = 6 , perfwhm = 2, intval = "maxo")
xs7
<- groupCorr(xs7, cor_eic_th = 0.8, calcCaS = TRUE)
xs8
#how many fragments need to be in a group for it to be included in the results. 5 - trace compounds; 20 - mostly high quality spectra
<- 20
min.clustersize
<- getPeaklist(xs8, intval="into")
peaktable write.csv(peaktable, file = "test_peaktable.csv") #This creates a peaktable .csv file with all m/z features sorted into "compounds"
#the following script identifies all "compounds" considered too small according to min.clustersize
<- getPeaklist(xs8, intval="maxo")
peaktable
# Replaces NA in intensities with zero
<- peaktable %>%
peaktable ::mutate(dplyr::across(dplyr::starts_with("X"), ~tidyr::replace_na(., 0)))
dplyr
<- sort(unique(as.numeric(peaktable$pcgroup)))
pcgroups <- NULL
lspectra <- NULL
extra <- NULL
small.clusters <- NULL
big.clusters <- 1
n for(x in seq_along(pcgroups)) {
<- length(peaktable[peaktable$pcgroup==x,ncol(peaktable)])
clustersize if(clustersize < as.numeric(min.clustersize))
{<- c(small.clusters, x)
small.clusters
}else
{<- c(big.clusters, x)
big.clusters <- peaktable[peaktable$pcgroup==x,]
gruppe1 <- gruppe1[, -(2:(ncol(gruppe1)-3-(length(mzXMLfiles))))]
gruppe1 <- gruppe1[, -(ncol(gruppe1):(ncol(gruppe1)-2))]
gruppe1 <- NULL
decider
for(i in 2:ncol(gruppe1))
{<- c(decider, sum(gruppe1[, i]))
decider
}<- which(decider==max(decider), arr.ind = TRUE)
highest <- data.frame(gruppe1[, 1], gruppe1[, highest[1]+1])
gruppe1 colnames(gruppe1) <- c("mz", "int")
<- gruppe1
lspectra[[n]] <- n+1
n
}
}
# remove peaks with zero intensities
<- lapply(lspectra, function(x) {x[x$int != 0,]})
lspectra
<- getReducedPeaklist(xs8, method = "sum", intval = "into", default.adduct.info = "maxint", mzrt.range = FALSE, npeaks.sum = FALSE, cleanup = FALSE)
reduced.peaktable if(is.null(small.clusters)==FALSE) {
for(z in small.clusters)
{<- reduced.peaktable[-(which(reduced.peaktable[, ncol(reduced.peaktable)]==z)), ]
reduced.peaktable
}
}write.csv(reduced.peaktable, file = "test_peaktable.csv") #This creates a peaktable .csv file with only "compounds" instead of m/z features
<- data.frame(Name = paste("Unknown", big.clusters, "RT =",round(reduced.peaktable$rt/60, 2) ), Class = "Unknown", RT = round(reduced.peaktable$rt/60, 2), stringsAsFactors = FALSE)
extra
####KOVATS#####
#Adding Kovats retention index to the extra obejct to write to msp
<- data.frame(rt = extra$RT)
data <- data.frame(Num = c(11, 13, 15, 17, 19, 21, 23, 25),
alkaneSeries rt = c(4.90, 8.00, 11.13, 14.05, 16.70, 19.37, 22.45, 25.77))
<- vector(length = nrow(data))
RI for (i in seq_len(nrow(data))) {
<- dplyr::case_when(
m $rt[i] >= alkaneSeries$rt[1] & data$rt[i] < alkaneSeries$rt[2] ~ alkaneSeries$Num[1],
data$rt[i] >= alkaneSeries$rt[2] & data$rt[i] < alkaneSeries$rt[3] ~ alkaneSeries$Num[2],
data$rt[i] >= alkaneSeries$rt[3] & data$rt[i] < alkaneSeries$rt[4] ~ alkaneSeries$Num[3],
data$rt[i] >= alkaneSeries$rt[4] & data$rt[i] < alkaneSeries$rt[5] ~ alkaneSeries$Num[4],
data$rt[i] >= alkaneSeries$rt[5] & data$rt[i] < alkaneSeries$rt[6] ~ alkaneSeries$Num[5],
data$rt[i] >= alkaneSeries$rt[6] & data$rt[i] < alkaneSeries$rt[7] ~ alkaneSeries$Num[6],
data$rt[i] >= alkaneSeries$rt[7] & data$rt[i] <= alkaneSeries$rt[8] ~ alkaneSeries$Num[7]
data
)
<- dplyr::case_when(
n $rt[i] >= alkaneSeries$rt[1] & data$rt[i] < alkaneSeries$rt[2] ~ alkaneSeries$Num[2],
data$rt[i] >= alkaneSeries$rt[2] & data$rt[i] < alkaneSeries$rt[3] ~ alkaneSeries$Num[3],
data$rt[i] >= alkaneSeries$rt[3] & data$rt[i] < alkaneSeries$rt[4] ~ alkaneSeries$Num[4],
data$rt[i] >= alkaneSeries$rt[4] & data$rt[i] < alkaneSeries$rt[5] ~ alkaneSeries$Num[5],
data$rt[i] >= alkaneSeries$rt[5] & data$rt[i] < alkaneSeries$rt[6] ~ alkaneSeries$Num[6],
data$rt[i] >= alkaneSeries$rt[6] & data$rt[i] < alkaneSeries$rt[7] ~ alkaneSeries$Num[7],
data$rt[i] >= alkaneSeries$rt[7] & data$rt[i] <= alkaneSeries$rt[8] ~ alkaneSeries$Num[8]
data
)
<- round(100*n + 100*(m-n) * (data$rt[i] - alkaneSeries[alkaneSeries$Num == n,]$rt)/(alkaneSeries[alkaneSeries$Num == m,]$rt - alkaneSeries[alkaneSeries$Num == n,]$rt), 0)
RI[i]
}
<- cbind(extra, RI)
extra
######END KOVATS######
# create the msp object with spectra and all meta information in the extra object (NAME, RETENTIONTIME, RETENTIONINDEX)
<- construct.msp(lspectra, extra)
export.msp
write.msp(export.msp, file = "test_spectra.msp") #This creates a NIST MS Search compatible .msp file with all compound pseudospectra
20.2 LC-HRMS using Waters MSe DIA workflow
20.3 Workflow for centroiding of raw file, peak picking and correlation of MS1 and MS2
20.4 First convert unifi data using MSConvert
Options: output format: mzML
binary encoding precision: 64-bit
Write index: check
TPP compatibility: check
use zlib compression: uncheck
package in gzip: uncheck
Filters:
1. TitleMaker
2. msLevel: 1-2 (need to click “Add” to add filter)