##requirements:
    ## instalation of vsearch, Usearch, cutadapt, blast
    ## vsearch here: https://github.com/torognes/vsearch
    ## usearch here: http://drive5.com/usearch/manual/install.html
    ## cutadapt here: http://cutadapt.readthedocs.io/en/stable/installation.html
    ## blast here: ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.4.0/ncbi-blast-2.4.0+.dmg
    ## blast here: https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=Download


## libraries needed

library(ShortRead)
library("parallel")
library("foreach")
library("doParallel")
library(vegan)
library(stringr) #for extracting words
library(VennDiagram)
library(reshape)
library(ggplot2)


##General purpose functions:
setPaths <- function(user = Sys.info()[7])
	{
	##Convenience function used to set the paths to software based on the user.
	##Define the code necessary to run usearch, vsearch, cutadapt, and blast plus the paths for blast database directory.
	
        paths <- list(usearch= "XXX" , #put path to downloaded Usearch tool instead of XXX
 			vsearch= "XXX" , #put path to downloaded vsearch tool instead of XXX
			cutadapt = "XXX" , #put path to downloaded cutadapt tool instead of XXX
			blast = "XXX" , #put path to downloaded blast tool instead of XXX
			dbPath = "XXX" , #put path to merged sequences database instead of XXX
			dbNCBIPath = "XXX" , #put path to downloaded NCBI RefSeq ITS database instead of XXX
			dbUNITEPath =  "XXX" , #put path to downloaded UNITE database instead of XXX
			blastdb.path = "XXX" #put path to downloaded makeblastdb command instead of XXX)

	return(paths)
	}	


writeFast <- function(seqs, desc, file)
    #Much faster function to write a fasta-like format without any line breaks within sequences
    {
    fasta <- c()
    seqs <- as.character(seqs)
    desc <- as.character(desc)
    if(substr(desc[1],1,1)!=">")
        {
        desc <- paste(">", desc, sep="")
        }
    fasta[seq(1,length(seqs)*2, by=2)] <- desc
    fasta[seq(2,length(seqs)*2, by=2)] <- seqs
    write(fasta, file=file)
    }

 
readFAST <- function(file)
    { ##the function readFASTA in Biostrings is slow as based on a loop. This is faster...but not by tons (10% faster)
    in.file <- readLines(file,  encoding="latin1")
    in.file <- iconv(in.file, from="latin1", to="ASCII", ".")
    desc.lines <- grep(">", in.file)

    seq.lines <- data.frame(desc.lines+1, stop.line = c(desc.lines[-1]-1, length(in.file)))
    sequence.line.list <- apply(seq.lines, 1, function(x) seq(x[1], x[2]))
    if(class( sequence.line.list) == "matrix")
  	{
    	sequence.line.list <- split(t(sequence.line.list), f=in.file[desc.lines])
    	descs <- names(sequence.line.list)
  	} else {
    	descs <- gsub(">", "", in.file[desc.lines])
  	}
    sequences <- lapply(sequence.line.list, function(x) paste(in.file[x], collapse=""))
    list(descs = descs, sequences = sequences)
            }
            
iupacGrep <- function(pattern,x,...)
    #Convenience package, this uses regular expressions to run grep with IUPAC codes
    {
    pattern <- gsub("R", "[A|G]", pattern)
    pattern <- gsub("Y", "[C|T]", pattern)
    pattern <- gsub("S", "[G|C]", pattern)
    pattern <- gsub("W", "[A|T]", pattern)
    pattern <- gsub("K", "[G|T]", pattern)
    pattern <- gsub("M", "[A|C]", pattern)
    pattern <- gsub("B", "[C|G|T]", pattern)
    pattern <- gsub("D", "[A|G|T]", pattern)
    pattern <- gsub("H", "[A|C|T]", pattern)
    pattern <- gsub("V", "[A|C|G]", pattern)
    pattern <- gsub("N", "[A|C|T|G]", pattern)
    pattern <- gsub("I", "[A|C|T|G]", pattern)  ##Note that Deoxyinosine is not a true universal match, see http://sg.idtdna.com/site/Catalog/Modifications/Category/7
    grep(pattern, x, ...)
    }

iupacSplit <- function(x, pattern,...)
    #Convenience package, this uses regular expressions to run grep with IUPAC codes
    {
    pattern <- gsub("R", "[A|G]", pattern)
    pattern <- gsub("Y", "[C|T]", pattern)
    pattern <- gsub("S", "[G|C]", pattern)
    pattern <- gsub("W", "[A|T]", pattern)
    pattern <- gsub("K", "[G|T]", pattern)
    pattern <- gsub("M", "[A|C]", pattern)
    pattern <- gsub("B", "[C|G|T]", pattern)
    pattern <- gsub("D", "[A|G|T]", pattern)
    pattern <- gsub("H", "[A|C|T]", pattern)
    pattern <- gsub("V", "[A|C|G]", pattern)
    pattern <- gsub("N", "[A|C|T|G]", pattern)
    pattern <- gsub("I", "[A|C|T|G]", pattern)  ##Note that Deoxyinosine is not a true universal match, see http://sg.idtdna.com/site/Catalog/Modifications/Category/7
    strsplit( x, pattern, ...)
    }
    
iupacAgrep <- function(pattern, x, ...)
    #Convenience package, this uses regular expressions to run grep with IUPAC codes
    {
    pattern <- gsub("R", "[A|G]", pattern)
    pattern <- gsub("Y", "[C|T]", pattern)
    pattern <- gsub("S", "[G|C]", pattern)
    pattern <- gsub("W", "[A|T]", pattern)
    pattern <- gsub("K", "[G|T]", pattern)
    pattern <- gsub("M", "[A|C]", pattern)
    pattern <- gsub("B", "[C|G|T]", pattern)
    pattern <- gsub("D", "[A|G|T]", pattern)
    pattern <- gsub("H", "[A|C|T]", pattern)
    pattern <- gsub("V", "[A|C|G]", pattern)
    pattern <- gsub("N", "[A|C|T|G]", pattern)
    pattern <- gsub("I", "[A|C|T|G]", pattern)  ##Note that Deoxyinosine is not a true universal match, see http://sg.idtdna.com/site/Catalog/Modifications/Category/7
    agrep(pattern, x, ...)
    }
    
#DNA functions to reverse or reverse-complement strings
strrev <- function(x) paste(rev(strsplit(x, "")[[1]]), collapse = "")

strcomp <- function(x) paste(c("A","T","G","C","Y","R","S","W","K","M","B","D","H","V","N", ".")[match(strsplit(x, "")[[1]], 
                                c("T","A","C","G","R","Y","S","W","M","K","V","H","D","B","N", "." ))], collapse = "")

strrevcomp <- function(x) paste(c("A","T","G","C","Y","R","S","W","K","M","B","D","H","V","N", ".")[match(rev(strsplit(x, "")[[1]]), 
                                c("T","A","C","G","R","Y","S","W","M","K","V","H","D","B","N", "." ))], collapse = "")


paths <- setPaths()


#additional paths


original.seqs<- "XXX" , #put path processed trimmed ITS sequences instead of XXX
dire <- "XXX" #put path to folder above original.seqs instead of XXX
forwardPrimer <- "AACTTTYRRCAAYGGATCWCT"
reversePrimer <- "AGCCTCCGCTTATTGATATGCTTAART"


### more paths and filenames
mergedf2oldastq <- "merged_reads.fastq"
notmergedf2oldastq <- "not_merged.fastq"
fwdtrimmed <- "trim_merge_reads_fwd.fastq"
trimmed <-    "trim_merge_reads.fastq"
trimmed.fa <- "trim_merge_reads.fa"
filtered <- "filter_trim_merge_reads.fa"
trimseqOTU <- "trim_match_to_Otus.txt"
trimseqZOTU <- "trim_match_to_Zotus.txt"


setwd(original.seqs)

#put sample name in every sequence identificator in every file in cdw - list files reads into store, chagnes every 4th line starting with first and writes a new file in a directory above
sapply(list.files(pattern="_001.fastq"), function(x) {
			setwd(original.seqs)
			store <- readLines(x)
			store[seq(1,length(readLines(x)), by= 4)] <- paste(store[seq(1,length(readLines(x)), by= 4)], ":sample:", sub("-|_", "", substr(x,11,14)), sep = "")
			setwd(dire)
			writeLines(store, paste("sampled_", sub(".trimmed", "", x), sep=""))
			})


#merging
system(paste(paths$usearch, " -fastq_mergepairs *_R1_001.fastq -fastqout ", mergedf2oldastq, " -fastqout_notmerged_fwd ", notmergedf2oldastq ,sep=""))

#cuts primers
system(paste(paths$cutadapt, " -g ^", forwardPrimer, " -o  ", fwdtrimmed, "  ", mergedf2oldastq, sep=""))
system(paste(paths$cutadapt, " -a ", strrevcomp(reversePrimer), " -o ", trimmed, "  ",  fwdtrimmed, sep=""))
#converts fastq trimemed to fasta 
writeFasta(readFastq(trimmed), trimmed.fa)

#max expected error 
maxee <- 1.0 
#minimal sequence length
 minseqlength <- 200
# default minimal abundance of a sequence to be considered OTU
 minsize = 2

#filters low quality merged sequences
system(paste(paths$vsearch," -fastq_filter ", trimmed, " -fastq_maxee ",maxee," -fastaout  ",filtered, sep=""))   


#finds unique sequences
system(paste(paths$vsearch," -derep_fulllength  ", filtered, "  -relabel Uniq --minseqlength ", minseqlength," -sizeout -fastaout uniques.fa -output uniques.fa", sep= ""))

#cluster otus with Uparse out algorithm #this has build in chimera detection
system(paste(paths$usearch, " -cluster_otus uniques.fa -minsize ", minsize," -otus otus.fa -relabel Otu -uparseout ",dire,"uparse.out.txt", sep=""))

#gets rid of line breaks and stuff in otus.fa
writeFast(readFAST("otus.fa")[[2]], readFAST("otus.fa")[[1]], "otus.fasta")


# assigns otus to sequences from trimmed merged sequences (needs to be fasta), increase the sensitivity allowing filtered reads to map,  and produces a huge file trimseqOTU 
system(paste(paths$vsearch," --usearch_global ", trimmed.fa ," -db otus.fa -strand plus -id 0.97 --blast6out ",trimseqOTU, sep=""))


# from the "sequence_match_to_Otus.txt" file make a table with OTUS samples and sequence frequencies
#extract sample name 
samples <- sapply(read.csv(trimseqOTU, sep = "\t", header =F)[,1], function(x) tail(unlist(strsplit(as.character(x), split = ":")), n=1))
otus <- read.csv(trimseqOTU, sep = "\t", header =F)[,2]


otusam<-as.data.frame.matrix(table(otus, samples))

#creates a data frame with otus and sequences
OtusFasta <- data.frame(descs = unlist(readFAST("otus.fasta")[[1]]), sequences =unlist(readFAST("otus.fasta")[[2]]))


#detects cores and leaves one for idle processes
cores <- parallel::detectCores()-1
        
    if(length(OtusFasta$descs) < 50)
        {
        cores <- 1
        }
    blockstarts <- floor(seq(1, length(OtusFasta$descs), length.out=cores+1))[-(cores+1)]
    blockends <- c(blockstarts[-1]-1,length(OtusFasta$descs))
    print(paste("Running", cores, "cores."))
 
database <-"/sh_general_release_dynamic_22.08.2016.fasta"

# creates temporary files with blast results
for(core.i in 1:cores)
        {
        writeFast(unlist(OtusFasta$sequences[blockstarts[core.i]:blockends[core.i]]),OtusFasta$descs[blockstarts[core.i]:blockends[core.i]], file=paste(dire, "tempFasta",core.i,".fasta", sep=""))
        }
    print(paste("file divided into N =",cores,"parts as tempFastaN.fasta in",dire))
    cl <- makeCluster(cores)
    registerDoParallel(cl, cores=cores)
    blastpath <- setPaths()$blast
    dbpath <-  setPaths()$dbUNITEPath
    setwd(dbpath)
    tempOtuBlast <- foreach(core.i = 1:cores, .packages = c("stats"), .combine=rbind) %dopar% 
        {

system(paste(blastpath, " -query  ", dire, "tempFasta",core.i, ".fasta -db ", dbpath, database," -task blastn -dust no -outfmt '7 stitle pident length mismatch gapopen qstart qend sstart send evalue bitscore' -num_alignments 1 -num_descriptions 1 -out ", dire,"OTU_UNITEold_Blast_matching",
        core.i, ".txt", sep=""))

        }
    closeAllConnections()
    print("Blast run complete, wrapping up dataFrame")
    
#create data.frame df2old with data from temporary files - otu, otuMatch, identity, length, sequence
temp <- c() 
for(core.i in 1:cores)
	{  
	temp <- append(temp, readLines(paste(dire, "OTU_UNITEold_Blast_matching", core.i, ".txt", sep="")))
	}


matches <- grep("Query", temp)+4
df2old <- c()
df2old$otu <- sub("# Query: ", "", temp[matches-4])   #OtusFasta$descs
df2old$otuMatchUNITE <- sapply(temp[matches], function(x) as.character(unlist(strsplit(x, "[\t]"))[1]))
df2old$identityUNITE <- sapply(temp[matches], function(x) as.numeric(unlist(strsplit(x, "\t"))[2]))
df2old$lengthUNITE <- 	sapply(temp[matches], function(x) as.numeric(unlist(strsplit(x, "\t"))[3]))
df2old$sequence <- OtusFasta$sequences[match(df2old$otu, OtusFasta$descs)] 
df2old$lineageUNITE <- sapply(df2old$otuMatchUNITE, function(x) as.character(unlist(strsplit(x, "[|]"))[5]))
z <- unname(strsplit(df2old$lineageUNITE, split = ";"))
df2old$speciesUNITE <- gsub('^.{0,3}', '',sapply(z, "[[", 7 ))
df2old$genusUNITE <- gsub('^.{0,3}', '',sapply(z, "[[", 6 ))


################### ncbi adding
database <-"/fungi.ITS.fna"
cl <- makeCluster(cores)
registerDoParallel(cl, cores=cores)
blastpath <- setPaths()$blast
dbpath <-  setPaths()$dbNCBIPath
setwd(dbpath)
tempOtuBlast <- foreach(core.i = 1:cores, .packages = c("stats"), .combine=rbind) %dopar% 
	{
	system(paste(blastpath, " -query ", dire, "tempFasta",core.i, ".fasta -db ", dbpath, database,
	" -task blastn -dust no -outfmt '7 stitle pident length mismatch gapopen qstart qend sstart send evalue bitscore' -num_alignments 1 -num_descriptions 1 -out ", dire,"OTU_NCBI_Blast_matching",
	core.i, ".txt", sep=""))
	}
closeAllConnections()
print("Blast run complete, wrapping up dataFrame")
    
#temp blast results
temp <- c() 
for(core.i in 1:cores)
	{  
	temp <- append(temp, readLines(paste(dire, "OTU_NCBI_Blast_matching", core.i, ".txt", sep="")))
	}

matches <- grep("Query", temp)+4


#add ncbi blast results to respective df2old columns
df2old$identityNCBI <- sapply(temp[matches], function(x) as.numeric(unlist(strsplit(x, "\t"))[2]))
df2old$lengthNCBI   <- sapply(temp[matches], function(x) as.numeric(unlist(strsplit(x, "\t"))[3]))
df2old$otuMatchNCBI <- sapply(temp[matches], function(x) as.character(unlist(strsplit(x, "[\t]"))[1]))
df2old$speciesNCBI  <- paste(word(temp[matches], 2), word(temp[matches],3), sep ="_")
df2old$genusNCBI    <- word(temp[matches], 2)


#final sci names are based on UNITE results

scinames <- as.character(df2old$speciesUNITE)

length <- df2old$lengthUNITE
identity <- df2old$identityUNITE
#insert NCBI species name for otus with incomplete sp name (GENUS_sp) for  those with same genus name from both db
scinames[intersect(grep("_sp$", df2old$speciesUNITE), which(df2old$genusUNITE==df2old$genusNCBI))] <- df2old$speciesNCBI[intersect(grep("_sp$", df2old$speciesUNITE), which(df2old$genusUNITE==df2old$genusNCBI))]
length[intersect(grep("_sp$", df2old$speciesUNITE), which(df2old$genusUNITE==df2old$genusNCBI))] <- df2old$lengthNCBI[intersect(grep("_sp$", df2old$speciesUNITE), which(df2old$genusUNITE==df2old$genusNCBI))]
identity[intersect(grep("_sp$", df2old$speciesUNITE), which(df2old$genusUNITE==df2old$genusNCBI))] <- df2old$identityNCBI[intersect(grep("_sp$", df2old$speciesUNITE), which(df2old$genusUNITE==df2old$genusNCBI))]

#insert NCBI species name for otus with only family orderm class or phylum 
scinames[grep("Fungi_sp|aceae_sp$|ales_sp$|cetes_sp$|cota_sp$|Incertae_sedis_sp$", df2old$speciesUNITE)]<- df2old$speciesNCBI[grep("Fungi_sp|aceae_sp$|ales_sp$|cetes_sp$|cota_sp$|Incertae_sedis_sp$", df2old$speciesUNITE)]

length[grep("Fungi_sp|aceae_sp$|ales_sp$|cetes_sp$|cota_sp$|Incertae_sedis_sp$", df2old$speciesUNITE)]<- df2old$lengthNCBI[grep("Fungi_sp|aceae_sp$|ales_sp$|cetes_sp$|cota_sp$|Incertae_sedis_sp$", df2old$speciesUNITE)]

identity[grep("Fungi_sp|aceae_sp$|ales_sp$|cetes_sp$|cota_sp$|Incertae_sedis_sp$", df2old$speciesUNITE)]<- df2old$identityNCBI[grep("Fungi_sp|aceae_sp$|ales_sp$|cetes_sp$|cota_sp$|Incertae_sedis_sp$", df2old$speciesUNITE)]


# resulting genera are the first word of scinames
genus <- sapply(scinames, function(x) unlist(strsplit(x, "_"))[1] )


#store entire UNITE database
unite <- readLines(paste(setPaths()$dbUNITEPath,"/sh_general_release_dynamic_22.08.2016.fasta", sep =""))

#resulting lineage based on search in UNITE db for matches with genus name
lineage <- sapply(genus, function(x) as.character(unlist(strsplit(grep(x, unite, val =T)[1], "[|]"))[5]))


# deal with few instances where genus is not found in the unite db and support lineage from UNITE 
lineage[which(is.na(lineage))] <- as.character(df2old$lineageUNITE)[which(is.na(lineage))] 

z <- unname(strsplit(lineage, split = ";"))

df2old$species <- scinames
df2old$genus <- genus
df2old$identity <- identity
df2old$length <- length
#extract respective list item and discard first 3 symbols
df2old$family <-  gsub('^.{0,3}', '',sapply(z, "[[", 5 ))
df2old$order <- gsub('^.{0,3}', '',sapply(z, "[[", 4))
df2old$class <- gsub('^.{0,3}', '',sapply(z, "[[", 3 ))
df2old$phylum <- gsub('^.{0,3}', '',sapply(z, "[[", 2 ))

#make a data.frame
df2old <- as.data.frame(df2old)

#name rows by Otus
row.names(df2old) <- df2old$otu

setwd(dire)


#reorder otusam according to df2old
otusam <- otusam[match(df2old$otu,row.names(otusam)),]


# add samples to df2old with frequencies of OTUs
df2old.exp <- cbind(df2old, otusam)


# create column with otu frequencies with both controls
df2old.exp$otuall <- rowSums(otusam)

# create column with otu frequencies without the negative control
df2old.exp$otufreq <- df2old.exp$otuall - df2old.exp$NC2
# create column with otu frequencies without both controls
df2old.exp$otufreq_samples <-  df2old.exp$otufreq - df2old.exp$PC2
dnes <- format(Sys.time(), "%Y%m%d")
write.csv(df2old.exp, paste("table_df2old_", dnes, ".csv", sep =""))


setwd(dire)

df2old.exp <- read.csv(tail(list.files(pattern="table_df2old_2018"),1), na.strings=c("", " ", NA))[,-1]


#clean conatminations based on beauveria 
#samples likely to be contaminated 
bov <- sort(c(0:23*4+1, 0:23*4+2,80, 95))
#classifier
conta <- rep("clear", 96)
conta[bov] <- "cont"


otusam.prp <- prop.table(as.matrix(otusam[,-c(97,98)]),2)
#which are statistically higher/lower 
kont.abs <- which(apply(otusam[,-c(97,98)],1, function(x) summary(aov(x~conta))[[1]]$"Pr(>F)"[1])<0.01)


#negative control to sample ratio
NC.samp.r <- 30
#positive control to sample ratio
PC.samp.r <- 0.3


# no need to trimm pc2 fatsa as is amplified with giTS7 abd ITS4
system(paste(paths$blast, " -query ", dire, "PC2.fasta -db ", dire, "otus.fasta -task blastn -dust no -outfmt '7 stitle pident length mismatch gapopen qstart qend sstart send evalue bitscore' -num_alignments 1 -num_descriptions 1 -out ", dire,"PC2_match_to_Otus.txt", sep=""))


#OTUs that do not follow thresholds based for negative or positive control 
kontrols <- sort(union(which(df2old.exp$otufreq_samples/df2old.exp$NC2<=NC.samp.r), which(df2old.exp$otufreq_samples/df2old.exp$PC2<=PC.samp.r)))

#plant OTUs
plantae <- sort(union(grep("Plant", df2old.exp$otuMatchUNITE), grep("Plant", df2old.exp$otuMatchNCBI))) 


# low abundance OTUs - quintupletons and lower
low.abundance  <- which(df2old.exp$otufreq_samples<=1)
throw <- sort(union(union(kontrols, kont.abs), union(low.abundance, plantae)))


#row.names(df2old.c)<- df2old.c$otu
df2old.c <- df2old.exp[-throw, ]

writeFast(df2old.c$sequence, df2old.c$otu, "otus_c.fasta")

#write a date stamped csv table wit OTUs and species and lineage
dnes <- format(Sys.time(), "%Y%m%d")
write.csv(df2old.c, paste("table_df2old_c_", dnes, ".csv", sep =""))


## Preparing communities based on various grouping criteria and taxonomic level


community2old.sample.abs <- t(df2old.c[,22:117])
community2old.sample <- prop.table(community2old.sample.abs, 1)
community2old.sample.obs <- community2old.sample.abs
community2old.sample.obs[community2old.sample.obs!=0] <- 1

region.sample <- substr(rownames(community2old.sample),1,1)
plant.sample <-  ifelse(nchar(rownames(community2old.sample))==3, substr(rownames(community2old.sample),1,2),substr(rownames(community2old.sample),1,3))
part <- ifelse(nchar(rownames(community2old.sample))==3, substr(rownames(community2old.sample),3,3),substr(rownames(community2old.sample),4,4))


# merged community2old - plant
community2old.abs <- matrix(nrow = nrow(community2old.sample.abs)/2, ncol = ncol(community2old.sample.abs))  
dimnames(community2old.abs) <- list(unique(plant.sample) ,dimnames(community2old.sample.abs)[[2]])
for(i in 1:48)
{
community2old.abs [i,] <- community2old.sample.abs[2*i-1,]+community2old.sample.abs[2*i,]
}


community2old <- as.matrix(prop.table(community2old.abs, 1))
community2old.obs <- community2old.abs
community2old.obs[community2old.obs!=0] <- 1


region <- substr(rownames(community2old),1,1)
# dvojice <-   substr(rownames(community2old),1,2)
plant <-  unique(plant.sample) 


#endup with community2old community2old.sample (both proportional and absolute)
 

#community2old analysis with species
# for samples and plants as islands respectively 


#prepare community2old matrix
community2old.spec.sample.abs <- t(aggregate(x =df2old.c[,22:117], list(df2old.c$species), sum)[,-1])
colnames(community2old.spec.sample.abs) <- t(aggregate(x =df2old.c[,22:117], list(df2old.c$species), sum)[,1])
community2old.spec.sample <- prop.table(community2old.spec.sample.abs, 1)
community2old.spec.sample.obs <- community2old.spec.sample.abs
community2old.spec.sample.obs[community2old.spec.sample.obs!=0] <- 1

# merged community2old - plant
community2old.spec.abs <- matrix(nrow = nrow(community2old.spec.sample.abs)/2, ncol = ncol(community2old.spec.sample.abs))  
dimnames(community2old.spec.abs) <- list(unique(plant.sample) ,dimnames(community2old.spec.sample.abs)[[2]])
for(i in 1:48)
{
community2old.spec.abs [i,] <- community2old.spec.sample.abs[2*i-1,]+community2old.spec.sample.abs[2*i,]
}


community2old.spec <- as.matrix(prop.table(community2old.spec.abs, 1))
community2old.spec.obs <- community2old.spec
community2old.spec.obs[community2old.spec.obs!=0] <- 1


#community2old analysis with genera
# for samples and plants as islands respectively 


#prepare community2old matrix
community2old.gen.sample.abs <- t(aggregate(x =df2old.c[,22:117], list(df2old.c$genus), sum)[,-1])
colnames(community2old.gen.sample.abs) <- t(aggregate(x =df2old.c[,22:117], list(df2old.c$genus), sum)[,1])
community2old.gen.sample <- prop.table(community2old.gen.sample.abs, 1)
community2old.gen.sample.obs <- community2old.gen.sample.abs
community2old.gen.sample.obs[community2old.gen.sample.obs!=0] <- 1

# merged community2old - plant
community2old.gen.abs <- matrix(nrow = nrow(community2old.gen.sample.abs)/2, ncol = ncol(community2old.gen.sample.abs))  
dimnames(community2old.gen.abs) <- list(unique(plant.sample) ,dimnames(community2old.gen.sample.abs)[[2]])
for(i in 1:48)
{
community2old.gen.abs [i,] <- community2old.gen.sample.abs[2*i-1,]+community2old.gen.sample.abs[2*i,]
}


community2old.gen <- as.matrix(prop.table(community2old.gen.abs, 1))


community2old.gen.obs <- community2old.gen
community2old.gen.obs[community2old.gen.obs!=0] <- 1


#community2old analysis with family
# for samples and plants as islands respectively 


#prepare community2old matrix
community2old.fam.sample.abs <- t(aggregate(x =df2old.c[,22:117], list(df2old.c$family), sum)[,-1])
colnames(community2old.fam.sample.abs) <- t(aggregate(x =df2old.c[,22:117], list(df2old.c$family), sum)[,1])
community2old.fam.sample <- prop.table(community2old.fam.sample.abs, 1)
community2old.fam.sample.obs <- community2old.fam.sample.abs
community2old.fam.sample.obs[community2old.fam.sample.obs!=0] <- 1

# merged community2old - plant
community2old.fam.abs <- matrix(nrow = nrow(community2old.fam.sample.abs)/2, ncol = ncol(community2old.fam.sample.abs))  
dimnames(community2old.fam.abs) <- list(unique(plant.sample) ,dimnames(community2old.fam.sample.abs)[[2]])
for(i in 1:48)
{
community2old.fam.abs [i,] <- community2old.fam.sample.abs[2*i-1,]+community2old.fam.sample.abs[2*i,]
}


community2old.fam <- as.matrix(prop.table(community2old.fam.abs, 1))


community2old.fam.obs <- community2old.fam
community2old.fam.obs[community2old.fam.obs!=0] <- 1


#community2old analysis with order 
# for samples and plants as islands respectively 


#prepare community2old matrix
community2old.order.sample.abs <- t(aggregate(x =df2old.c[,22:117], list(df2old.c$order), sum)[,-1])
colnames(community2old.order.sample.abs) <- t(aggregate(x =df2old.c[,22:117], list(df2old.c$order), sum)[,1])
community2old.order.sample <- prop.table(community2old.order.sample.abs, 1)
community2old.order.sample.obs <- community2old.order.sample.abs
community2old.order.sample.obs[community2old.order.sample.obs!=0] <- 1

# merged community2old - plant
community2old.order.abs <- matrix(nrow = nrow(community2old.order.sample.abs)/2, ncol = ncol(community2old.order.sample.abs))  
dimnames(community2old.order.abs) <- list(unique(plant.sample) ,dimnames(community2old.order.sample.abs)[[2]])
for(i in 1:48)
{
community2old.order.abs [i,] <- community2old.order.sample.abs[2*i-1,]+community2old.order.sample.abs[2*i,]
}


community2old.order <- as.matrix(prop.table(community2old.order.abs, 1))


community2old.order.obs <- community2old.order
community2old.order.obs[community2old.order.obs!=0] <- 1


#community2old analysis with class
# for samples and plants as islands respectively 


#prepare community2old matrix
community2old.class.sample.abs <- t(aggregate(x =df2old.c[,22:117], list(df2old.c$class), sum)[,-1])
colnames(community2old.class.sample.abs) <- t(aggregate(x =df2old.c[,22:117], list(df2old.c$class), sum)[,1])
community2old.class.sample <- prop.table(community2old.class.sample.abs, 1)
community2old.class.sample.obs <- community2old.class.sample.abs
community2old.class.sample.obs[community2old.class.sample.obs!=0] <- 1

# merged community2old - plant
community2old.class.abs <- matrix(nrow = nrow(community2old.class.sample.abs)/2, ncol = ncol(community2old.class.sample.abs))  
dimnames(community2old.class.abs) <- list(unique(plant.sample) ,dimnames(community2old.class.sample.abs)[[2]])
for(i in 1:48)
{
community2old.class.abs [i,] <- community2old.class.sample.abs[2*i-1,]+community2old.class.sample.abs[2*i,]
}


community2old.class <- as.matrix(prop.table(community2old.class.abs, 1))


community2old.class.obs <- community2old.class
community2old.class.obs[community2old.class.obs!=0] <- 1


#community2old analysis with phylum
# for samples and plants as islands respectively 


#prepare community2old matrix
community2old.phyl.sample.abs <- t(aggregate(x =df2old.c[,22:117], list(df2old.c$phylum), sum)[,-1])
colnames(community2old.phyl.sample.abs) <- t(aggregate(x =df2old.c[,22:117], list(df2old.c$phylum), sum)[,1])
community2old.phyl.sample <- prop.table(community2old.phyl.sample.abs, 1)
community2old.phyl.sample.obs <- community2old.phyl.sample.abs
community2old.phyl.sample.obs[community2old.phyl.sample.obs!=0] <- 1

# merged community2old - plant
community2old.phyl.abs <- matrix(nrow = nrow(community2old.phyl.sample.abs)/2, ncol = ncol(community2old.phyl.sample.abs))  
dimnames(community2old.phyl.abs) <- list(unique(plant.sample) ,dimnames(community2old.phyl.sample.abs)[[2]])
for(i in 1:48)
{
community2old.phyl.abs [i,] <- community2old.phyl.sample.abs[2*i-1,]+community2old.phyl.sample.abs[2*i,]
}


community2old.phyl <- as.matrix(prop.table(community2old.phyl.abs, 1))


community2old.phyl.obs <- community2old.phyl
community2old.phyl.obs[community2old.phyl.obs!=0] <- 1


#################################################################
####################      ISOLATION 2015   ########################
###################################################################


#read in table with isolates, sequences and assigned species according to NCBI or UNITE
tabulka <- read.csv("table_isolation_15.csv", na.strings=c("", " ", NA))


# community15 analysis
# community15 analysis - species
# samples, parts and plants as islands respectively 
# absolute and observation tables generated (abs, obs)
# can do similarly for any taxonomic resolution (genus, family, order, class)


community15.spec.sample.abs <- as.data.frame.matrix(table(tabulka$fragment, tabulka$species))
riadok <- rowSums(community15.spec.sample.abs)
riadok[riadok == 0 ] <- 1
community15.spec.sample <- apply(community15.spec.sample.abs, 2, function(x) x/riadok)
community15.spec.sample.obs <- community15.spec.sample.abs
community15.spec.sample.obs[community15.spec.sample.obs!=0] <- 1


# merged community15 - part
community15.spec.part.abs <- as.data.frame.matrix(table(tabulka$part, tabulka$species))
riadok <- rowSums(community15.spec.part.abs)
riadok[riadok == 0 ] <- 1
community15.spec.part <- apply(community15.spec.part.abs, 2, function(x) x/riadok)
community15.spec.part.obs <- community15.spec.part.abs
community15.spec.part.obs[community15.spec.part.obs!=0] <- 1


# merged community15 - plant
community15.spec.abs <- as.data.frame.matrix(table(tabulka$plant, tabulka$species))
riadok <- rowSums(community15.spec.abs)
riadok[riadok == 0 ] <- 1
community15.spec <- apply(community15.spec.abs, 2, function(x) x/riadok)
community15.spec.obs <- community15.spec.abs
community15.spec.obs[community15.spec.obs!=0] <- 1


####################
## Comparing methods    
####################


# make dish identifier 
community <- community15.spec.sample.abs
well.sample.pos<- which(!substr(rownames(community), 6,6) %in% c("a", "b","c","d","e"))
RPD.sample.pos<- which(substr(rownames(community), 6,6) %in% c("a", "b","c","d","e"))
PDAwell.clean.sample.pos <- which(substr(rownames(community), 6,6) %in% c("2"))
PDAwell.sample.pos <- which(substr(rownames(community), 6,6) %in% c("2", "5"))
SAwell.sample.pos <- which(substr(rownames(community), 6,6) %in% c("1", "4"))
AMMwell.sample.pos <- which(substr(rownames(community), 6,6) %in% c("3", "6"))
PDA.sample.pos <- append(RPD.sample.pos, PDAwell.sample.pos)
dish.sample<- rep("6well", nrow(community15.spec.sample.abs))
dish.sample[RPD.sample.pos] <- "RPD" 

medium <- rep("PDA", nrow(community))
medium[SAwell.sample.pos] <- "SA"
medium[AMMwell.sample.pos] <- "AMM"
bengal <- rep("none", nrow(community))
bengal[which(substr(rownames(community), 6,6) %in% c("4","5", "6"))] <- "bengal" 


## species accumulation
png(filename="accum_Isolation2015_method.png",units="in", 
     width=10, 
     height=8, 
     pointsize=12, 
     res=72)

	
	plot(specaccum(community[well.sample.pos,], "rare"), ci.type = "polygon", ci.col=rgb(1,0,0, alpha = 0.1), col=rgb(1,0,0, alpha = 0.5), ci.lty =0, ,  xlab ="Fragments", ylab="Species", lty =3)
	plot(specaccum(community[RPD.sample.pos,], "rare"), ci.type = "polygon", col=rgb(0,0,1, alpha = 0.5), ci.col =rgb(0,0,1, alpha = 0.1), lty =3,  ci.lty =0,add =T)
	plot(specaccum(community[well.sample.pos,], "coll"), col=rgb(1,0,0), add =T)
	plot(specaccum(community[RPD.sample.pos,], "coll"), col=rgb(0,0,1), add =T)
	legend( "topleft",  c("Multiwell plate with different media","Petri dish with single medium", " ", " "), col=c("red", "blue" ,"red", "blue"), 		title=c("Observed        	 Randomized"), title.adj=0.03, lty=c(1,1,2,2), ncol=2, cex =1.2, bty="n")

dev.off()


#Chao estimator of community richness
specpool(community)$chao


## nmds 

#####whole community
agregat <- aggregate(. ~substr(rownames(community),1,3)+dish.sample, data = community, sum)

rownames(agregat) <- paste(agregat[,1], agregat[,2], sep =" ")
x<- 2
agr.num <- c(1:x)
agregat.n <- agregat[, -agr.num]
mds1 <- metaMDS(agregat.n, trymax =100000, k=2, dist ="jaccard")


png(filename="NMDS_Isolation2015_method.png",units="in", 
     width=10, 
     height=8, 
     pointsize=12, 
     res=72)

	plot(mds1 , type ="n")
	points(mds1$points, pch = c(15, 16)[as.factor(agregat[,2])], col=c( "red" ,"blue")[as.factor(agregat[,2])], cex = 2)
	ordihull(mds1$points, as.factor(agregat[,2]))
	legend( "topleft",  c("Multiwell plate with multiple media","Petri dish with single medium"), pch=c(15,16), col=c( "red" ,"blue"), cex =1.2, bty="n")
	legend( "bottomright",  paste("stress = ", mds1$stress,  sep =""), bty="n")

dev.off()


adonis(agregat.n~substr(agregat[,1], 1,1)*agregat[,2])


######## multiwell
#community[well.sample.pos,], sum)
agregat <- aggregate(. ~substr(rownames(community[well.sample.pos,]),1,3)+medium[well.sample.pos]+bengal[well.sample.pos], data =community[well.sample.pos,], sum)
rownames(agregat) <- paste(agregat[,1], agregat[,2], agregat[,3], sep =" ")

agregat <- agregat[which(rowSums(agregat[,-agr.num])!=0),]

x<-3
agr.num <- c(1:x)

agregat.n <- agregat[, -agr.num]

agregat.n <- agregat.n[, which(colSums(agregat.n)!=0)]
# dispose 213 SA to be able to plot meaningful distances
agregat.x <- agregat.n[-27,]

mds1 <- metaMDS(agregat.x, trymax =100000, k=2, dist ="jaccard")

png(filename="NMDS_Isolation2015_multimedia.png",units="in", 
     width=10, 
     height=8, 
     pointsize=12, 
     res=72)

	
plot(mds1 , type ="n")
	points(mds1$points+0.005, pch = c(15, 16,17)[as.factor(agregat[-27,2])], col=c( "red" ,"orange")[as.factor(agregat[-27,3])], cex = 2)
	ordihull(mds1$points+0.005, as.factor(agregat[-27,2]))
	legend( "topleft",  c("	Aspergillus minimal mediun","	Potato dextrose agar", "	Sabordough agar", " ", " ", " "), pch=c(15:17,15:17), col=rep(c("red" ,"orange"),each=3), title=c("Rose bengal           	   no Rose bengal"), title.adj=0.03, ncol=2, cex =1.2, bty="n")
	legend( "bottomright",  paste("stress = ", mds1$stress,  sep =""), bty="n")


dev.off()


########## VENN DIAGRAM

library(VennDiagram)


#community <- aggregate(.~ dish.sample +  substr(rownames(community),1,4), data = community, sum)[,-c(1,2)]

Tissue <- factor(rep(rep(c("Leaves", "Roots", "Stems"),each =2, 10)),
		levels =c("Leaves", "Stems", "Roots"))

Field <- factor(rep(c("Conventional", "Organic"), each =30),
		levels =c("Conventional", "Organic"))
dish.sample<- factor(rep(c("6well","RPD"), 30),
		levels =c("6well","RPD"))


overlap<- calculate.overlap(x=list(colnames(community[,(colSums(community[RPD.sample.pos,])!=0)]),colnames(community[,(colSums(community[well.sample.pos,])!=0)])))
# draw venn from overlap


png(filename="Venn_Isolation2015_method.png",units="in", 
     width=6, 
     height=6, 
     pointsize=12, 
     res=72)

#dev.off()

draw.pairwise.venn(length(overlap$a1),
length(overlap$a2),
length(overlap$a3),
scaled =F, 
fill = c("blue", "red"),
alpha =0.4,
cex =4
)

dev.off()


#Isolation in fields
	v1 <- venn.diagram(
		list( "Conventional"= colnames(community15.spec)[colSums(community15.spec[1:5,])!=0], 
		"Organic" =colnames(community15.spec)[colSums(community15.spec[6:10,])!=0]),
		filename =NULL, 
		# cex.prop= log10,
		cex=4, cat.cex =2,
		cat.pos =c(330, 150),
		cat.dist =0.05)

	png(filename="Venn_Field_Isolation_2015.png",
		units="in", 
   		width=6, 
    		height=6, 
    		pointsize=12, 
     		res=300)

		grid.newpage()
		grid.draw(v1)

	dev.off()

#Isolation in tissue
	Tissue <- rep(c("Leaves","Roots","Stems"), 10)
	v1 <- venn.diagram(
		list( Leaves= colnames(community15.spec.part.abs)[colSums(community15.spec.part.abs[Tissue=="Leaves",])!=0], 
		Stems =colnames(community15.spec.part.abs)[colSums(community15.spec.part.abs[Tissue=="Stems",])!=0],
		Roots =colnames(community15.spec.part.abs)[colSums(community15.spec.part.abs[Tissue=="Roots",])!=0]),
		filename =NULL, 
		fill=c("green",  "yellowgreen", "saddlebrown"), 
		alpha =0.3,
		# cex.prop= log10,
		 cex=4,
		cat.cex =2)

	png(filename="Venn_Tissue_Isolation_2015.png",
		units="in", 
   		width=6, 
    		height=6, 
    		pointsize=12, 
     		res=300)

		grid.newpage()
		grid.draw(v1)

	dev.off()


## rarefaction with anova comparing 1:450 subsets of both fragments 1000 times and extracting the p values for later comparison
pival <- c()
for(j in 1:1000)
{
meth <- c()
rpdspecnum <- c()
wellspecnum <- c()
specnum <- c()


for(i in 1:length(RPD.sample.pos))
	{
	rpdspecnum <- append(rpdspecnum,  sum(colSums(community[sample(RPD.sample.pos,i),])!=0))
	wellspecnum <- append(wellspecnum,  sum(colSums(community[sample(well.sample.pos,i),])!=0))
	specnum <- append(specnum, c(sum(colSums(community[sample(RPD.sample.pos,i),])!=0),  sum(colSums(community[sample(well.sample.pos,i),])!=0)))
	meth <- append(meth, c("PDAwell", "RPD"))
	}

pival <-  append(pival, summary(aov(specnum~meth))[[1]]$"Pr(>F)"[1])

}

hist(pival)


## rankabundances of species in sampled plants

community <- community15.spec.sample.abs[,order(colSums(community15.spec.abs))]

ilol <-  data.frame(	Species = factor(rep(colnames(community),2), levels = colnames(community) ),
		 	Count = c( colSums(community[RPD.sample.pos,]), 
				colSums(community[well.sample.pos,])), 
			Method = rep(c("Single medium", "Multiple media"), each =length(colnames(community))))


g0 <- ggplot(ilol, aes(x=Species, y=Count, fill = Method))
		g0 + 	geom_bar(stat="identity") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + coord_flip() 

ggsave(filename="ranked_abundance_stacked_Isolation_2015.png",units="in", 
		width=8, 
		height=16)  


## null model Venn Diagram example 
community <- community15.spec.sample.obs
community <- aggregate(.~ dish.sample +  substr(rownames(community),1,4), data = community, sum)[,-c(1,2)]

Tissue <- factor(rep(rep(c("Leaves", "Roots", "Stems"),each =2, 10)),
		levels =c("Leaves", "Stems", "Roots"))

Field <- factor(rep(c("Conventional", "Organic"), each =30),
		levels =c("Conventional", "Organic"))
dish.sample<- factor(rep(c("6well","RPD"), 30),
		levels =c("6well","RPD"))

# random community permutations 
randomCom <- permatfull(community>0, fixedmar="both", mtype="prab",  burnin =270,  times=9999) 


# functions to extract number of species by cultivation method

meth2F <- function(x) sum(colSums(x[dish.sample=="RPD",]>0)>0 & colSums(x[dish.sample!="RPD",]>0)==0)
meth1F <- function(x) sum(colSums(x[dish.sample=="6well",]>0)>0 & colSums(x[dish.sample!="6well",]>0)==0)
bothF <- function(x) sum(colSums(x[dish.sample=="RPD",]>0)>0 & colSums(x[dish.sample=="6well",]>0)>0)


# lee and sti as intermediate variables with means and confidence interval values of expected species count

funlist <- list(meth1F, bothF )
lee<- sapply(funlist, function (x)  {	result <- unlist(lapply(randomCom$perm,	x))
				paste( sort(result)[floor(0.01*length(result))], " - ", round(median(result),1), " - ",sort(result)[ceiling(0.99*length(result))], sep ="")
				print(paste(sort(result)[floor(0.01*length(result))], " - ", round(mean(result)), " - ",sort(result)[ceiling(0.99*length(result))], sep =""))
})


funlist <-  list(meth2F, bothF)
sti <- sapply(funlist, function (x)  {	result <- unlist(lapply(randomCom$perm,	x))
				paste(sort(result)[floor(0.01*length(result))], " - ", round(median(result),1), " - ",sort(result)[ceiling(0.99*length(result))], sep ="")
				print(paste( sort(result)[floor(0.01*length(result))], " - ", round(mean(result)), " - ",sort(result)[ceiling(0.99*length(result))], sep =""))
})

#create Venn Diagram stored in v1 with labels extracted from lee and sti variables
v1 <- draw.pairwise.venn(length(overlap$a1),
length(overlap$a2),
length(overlap$a3),
scaled =F, 
fill = c("blue", "red"),
alpha =0.4,
cex =2
)
v1[[5]]$label <- lee[1]
v1[[6]]$label <- sti[1]
v1[[7]]$label <- lee[2]

# export venn diagram to png file 

png(filename="Venn_Isolation2015_method_exp.png",
		units="in", 
   		width=6, 
    		height=6, 
    		pointsize=12, 
     		res=300)

		grid.newpage()
		grid.draw(v1)

	dev.off()