# Clear memory of characters:
ls()
## character(0)
rm(list=ls())

#Set Working Directory: 
setwd("~/CompBio")

# get programs
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# read in data:
plantTrans <- read.csv("plantTransPlantsCompBio.csv", header=TRUE, sep = ",", stringsAsFactors=FALSE)

BombSurvey <- read.csv("2015SurveyCompBio.csv", header=TRUE, sep = ",", stringsAsFactors=FALSE)

head(BombSurvey)
##   X target_name      site HBviralLoad  n BombusViralLoad         sd
## 1 1        BQCV BOST-NEAR   366994727 21      5316210.40  9459693.7
## 2 2        BQCV      CIND    14771905 10     28455499.25 56110898.3
## 3 3        BQCV     CLERK           0 21        62970.41   146142.2
## 4 4        BQCV       COL   992531887 23      1372201.88  1964750.5
## 5 5        BQCV      FERL   419532555 20     15633038.03 28317436.2
## 6 6        BQCV      FLAN           0 10     12662784.20 13246489.6
##            se apiary_near_far ShannonDIV Density sumApiaries sumColonies
## 1  2064274.40               1  1.6187671    40.8           9          45
## 2 17743824.00               0  0.0000000   132.7           1           1
## 3    31890.84               0  0.5948803    27.3           0           0
## 4   409678.81               0  0.6917615     3.8           3           3
## 5  6331971.24               1  0.7977888    26.5           2          27
## 6  4188907.81               0  0.1683164    67.5           4          10
##    BombPrev BombMeanNA
## 1 0.9523810  5582020.9
## 2 1.0000000 28455499.3
## 3 0.2380952   264475.7
## 4 0.8695652  1578032.2
## 5 0.8000000 19541297.5
## 6 1.0000000 12662784.2
#adding a column that will be the log transformation of VL to fix non-normality
BombSurvey$VL <- log(BombSurvey$BombusViralLoad)
#Make all '-Inf' into zeros
BombSurvey$VL[BombSurvey$VL == "-Inf"] <- 0
#check distribution
hist(BombSurvey$VL)

Linear Regression:

#########LINEAR REGRESSION#######################################

# FUNCTION: fitLinear
# inputs: numeric vector of predictor (x) and response (y)
# outputs: slope and p-value
# data

#Splitting the dataframe based on virus target:
DWV <- subset(BombSurvey, target_name=="DWV")
BQCV <- subset(BombSurvey, target_name=="BQCV")


#Creating a two column dataframe for the linear regression
names(DWV)
##  [1] "X"               "target_name"     "site"           
##  [4] "HBviralLoad"     "n"               "BombusViralLoad"
##  [7] "sd"              "se"              "apiary_near_far"
## [10] "ShannonDIV"      "Density"         "sumApiaries"    
## [13] "sumColonies"     "BombPrev"        "BombMeanNA"     
## [16] "VL"
BQReg<-cbind(BQCV$sumApiaries, BQCV$BombPrev)
DWReg <- cbind(DWV$sumApiaries, DWV$BombPrev)

#Get rid of zeros, (STILL WORKING ON THIS)

#creating a two column dataframe for the ANOVA
BQAnov <- cbind(BQCV$VL,BQCV$apiary_near_far)
DWAnov <- cbind(DWV$VL,DWV$apiary_near_far)

fitLinear <- function(data=data.frame(runif(10),runif(10))) {
  x = data[,1]
  y = data[,2]
  myMod <- lm(y~x) # fit the model
  myOut <- c(slope=summary(myMod)$coefficients[2,1],
             pValue=summary(myMod)$coefficients[2,4])
  return(myOut)
}
fitLinear()
##    slope   pValue 
## 0.101887 0.720334
#Testing the function on my dataset:

#Using the linear regression function:
fitLinear(BQReg)
##       slope      pValue 
## 0.065235870 0.008274574
fitLinear(DWReg)
##         slope        pValue 
## -0.0003124868  0.9797618055
#Create output plot
plotfitLinear <- function(data=data.frame(runif(10),runif(10))) {
  x = data[,1]
  y = data[,2]
  plot1 <- plot(x=x,y=y,pch=21,bg="lightblue",cex=2)
  myMod <- lm(y~x)
  abline(myMod)
  return(plot1)
}
plotfitLinear()

## NULL
#Plotting my data

plotfitLinear(BQReg)

## NULL
plotfitLinear(DWReg)

## NULL

ANOVA

#########ANOVA#########################################

# FUNCTION: ANOVA
# inputs: categorical vector predictor (x) numeric vector of  response (y)
# outputs: p-value
# data

anovafunc <- function(data=data.frame(runif(10),rep(c("birds","poodles"),each=5))) {
  x = data[,2]
  y = data[,1]
  myAnova <- aov(y~x)
  AnovaOut1 <- summary(myAnova)
  AnovaOut2 <- print(myAnova)
  myOut <- list(AnovaOut1,AnovaOut2)
  return(myOut)
}
anovafunc()
## Call:
##    aov(formula = y ~ x)
## 
## Terms:
##                         x Residuals
## Sum of Squares  0.0443390 0.6516589
## Deg. of Freedom         1         8
## 
## Residual standard error: 0.2854074
## Estimated effects may be unbalanced
## [[1]]
##             Df Sum Sq Mean Sq F value Pr(>F)
## x            1 0.0443 0.04434   0.544  0.482
## Residuals    8 0.6517 0.08146               
## 
## [[2]]
## Call:
##    aov(formula = y ~ x)
## 
## Terms:
##                         x Residuals
## Sum of Squares  0.0443390 0.6516589
## Deg. of Freedom         1         8
## 
## Residual standard error: 0.2854074
## Estimated effects may be unbalanced
#Checking function with my data
anovafunc(BQAnov)
## Call:
##    aov(formula = y ~ x)
## 
## Terms:
##                        x Residuals
## Sum of Squares   5.18842  69.30858
## Deg. of Freedom        1        17
## 
## Residual standard error: 2.019152
## Estimated effects may be unbalanced
## [[1]]
##             Df Sum Sq Mean Sq F value Pr(>F)
## x            1   5.19   5.188   1.273  0.275
## Residuals   17  69.31   4.077               
## 
## [[2]]
## Call:
##    aov(formula = y ~ x)
## 
## Terms:
##                        x Residuals
## Sum of Squares   5.18842  69.30858
## Deg. of Freedom        1        17
## 
## Residual standard error: 2.019152
## Estimated effects may be unbalanced
anovafunc(DWAnov)
## Call:
##    aov(formula = y ~ x)
## 
## Terms:
##                        x Residuals
## Sum of Squares   68.1897  356.8392
## Deg. of Freedom        1        17
## 
## Residual standard error: 4.581543
## Estimated effects may be unbalanced
## [[1]]
##             Df Sum Sq Mean Sq F value Pr(>F)  
## x            1   68.2   68.19   3.249 0.0892 .
## Residuals   17  356.8   20.99                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## [[2]]
## Call:
##    aov(formula = y ~ x)
## 
## Terms:
##                        x Residuals
## Sum of Squares   68.1897  356.8392
## Deg. of Freedom        1        17
## 
## Residual standard error: 4.581543
## Estimated effects may be unbalanced
#Function for Anova output:
myboxplot <- function(data=data.frame(runif(10),rep(c("birds","poodles"),each=5))) {
  x = data[,2]
  y = data[,1]
  box <- boxplot(y~x)
  return(box)
}

myboxplot()

## $stats
##           [,1]       [,2]
## [1,] 0.3123433 0.02177750
## [2,] 0.4112163 0.09241337
## [3,] 0.5305865 0.64407506
## [4,] 0.5791060 0.76242163
## [5,] 0.6579374 0.79719163
## 
## $n
## [1] 5 5
## 
## $conf
##           [,1]      [,2]
## [1,] 0.4119561 0.1706489
## [2,] 0.6492170 1.1175012
## 
## $out
## numeric(0)
## 
## $group
## numeric(0)
## 
## $names
## [1] "birds"   "poodles"
#Plotting my data
myboxplot(BQAnov)

## $stats
##          [,1]     [,2]
## [1,] 10.17631 12.82901
## [2,] 12.67244 14.19091
## [3,] 14.38769 15.48627
## [4,] 15.93283 16.03233
## [5,] 17.16385 17.70910
## 
## $n
## [1] 12  7
## 
## $conf
##          [,1]     [,2]
## [1,] 12.90060 14.38660
## [2,] 15.87478 16.58594
## 
## $out
## numeric(0)
## 
## $group
## numeric(0)
## 
## $names
## [1] "0" "1"
myboxplot(DWAnov)

## $stats
##           [,1]      [,2]
## [1,]  0.000000  5.849028
## [2,]  0.000000  7.180848
## [3,]  3.585786  8.523187
## [4,]  8.884197 11.124129
## [5,] 10.512802 13.208084
## 
## $n
## [1] 12  7
## 
## $conf
##            [,1]      [,2]
## [1,] -0.4663559  6.168323
## [2,]  7.6379276 10.878051
## 
## $out
## [1] 0
## 
## $group
## [1] 2
## 
## $names
## [1] "0" "1"

Contingency Table:

#############################################
#Data setup for contingency table 
#HBplants <- subset(plantTrans, group=="T")
#HBplants <- ddply(HBplants, c("spp"), summarise, 
 #                 n = length(BINYprefilter),
  #                mean = mean(BINYprefilter, na.rm=TRUE),
   #               sum = sum(BINYprefilter, na.rm = TRUE),
    #              sd = sd(BINYprefilter, na.rm=TRUE),
     #             se = sd / sqrt(n))

#Checking function with my data
#anovafunc(HBplants$spp,HBplants$mean)

Creating a Random Dataset

#Creating Random Data Set