Selection of optimal parameters¶
Coral¶
WGCNA: Define thresholding power
Library
library(WGCNA)
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /yshare1/home/toru/anaconda3/envs/wgcna/lib/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=ja_JP.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=ja_JP.UTF-8 LC_COLLATE=ja_JP.UTF-8
## [5] LC_MONETARY=ja_JP.UTF-8 LC_MESSAGES=ja_JP.UTF-8
## [7] LC_PAPER=ja_JP.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=ja_JP.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] WGCNA_1.69 fastcluster_1.1.25 dynamicTreeCut_1.63-1
## [4] rmarkdown_1.12
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.1 lattice_0.20-38 GO.db_3.10.0
## [4] png_0.1-7 digest_0.6.18 foreach_1.5.0
## [7] R6_2.4.1 backports_1.1.7 acepack_1.4.1
## [10] stats4_3.6.1 RSQLite_2.2.0 evaluate_0.13
## [13] ggplot2_3.3.1 pillar_1.4.4 rlang_0.4.6
## [16] data.table_1.12.8 rstudioapi_0.11 blob_1.2.1
## [19] S4Vectors_0.24.4 rpart_4.1-15 Matrix_1.2-17
## [22] preprocessCore_1.48.0 checkmate_2.0.0 splines_3.6.1
## [25] stringr_1.4.0 foreign_0.8-71 htmlwidgets_1.5.1
## [28] bit_1.1-15.2 munsell_0.5.0 compiler_3.6.1
## [31] xfun_0.6 pkgconfig_2.0.3 BiocGenerics_0.32.0
## [34] base64enc_0.1-3 htmltools_0.3.6 nnet_7.3-12
## [37] tibble_3.0.1 gridExtra_2.3 htmlTable_1.13.3
## [40] Hmisc_4.4-0 IRanges_2.20.2 codetools_0.2-16
## [43] matrixStats_0.56.0 crayon_1.3.4 grid_3.6.1
## [46] gtable_0.3.0 lifecycle_0.2.0 DBI_1.1.0
## [49] magrittr_1.5 scales_1.1.1 impute_1.60.0
## [52] stringi_1.4.3 doParallel_1.0.15 latticeExtra_0.6-29
## [55] ellipsis_0.3.1 vctrs_0.3.0 Formula_1.2-3
## [58] RColorBrewer_1.1-2 iterators_1.0.12 tools_3.6.1
## [61] bit64_0.9-7 Biobase_2.46.0 glue_1.3.1
## [64] jpeg_0.1-8.1 parallel_3.6.1 survival_3.1-12
## [67] yaml_2.2.0 AnnotationDbi_1.48.0 colorspace_1.4-1
## [70] cluster_2.0.8 memoise_1.1.0 knitr_1.22
Load files
# matFPKM.c : Gene expression table (FPKM level) of corals
# matTPM.c : Gene expression table (TPM level) of corals
# meta.c : Metadata of coral samples
load("Expression_Coral.UT.RData")
Preprocessing
Remove low-expressed genes
以下のページを参考に低発現遺伝子を除外する。ここではFPKM>1で5割のサンプルから検出されている遺伝子を選んだ。
Reference: https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/faq.html
4. Can WGCNA be used to analyze RNA-Seq data?
We suggest removing features whose counts are consistently low (for example, removing all features that have a count of less than say 10 in more than 90% of the samples) because such low-expressed features tend to reflect noise and correlations based on counts that are mostly zero aren't really meaningful.
n_detected = rowSums(matFPKM.c > 1)
detected = (n_detected / ncol(matFPKM.c) > 0.5)
table(detected)
## detected
## FALSE TRUE
## 23321 16699
mat.c = matFPKM.c[detected,] # choose mostly-detected genes
sum(apply(mat.c, 1, var) == 0) # check whether zero-variance gene is found
## [1] 0
Log2-transformation
mat.c = log2(mat.c + 1)
mat.c = t(mat.c) # transpose
WGCNA: Choose adequate soft-thresholding power
Reference: Step-by-step network construction and module detection
https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/FemaleLiver-02-networkConstr-man.pdf
powers = 1:20
sft.c = pickSoftThreshold(mat.c,
powerVector = powers,
verbose = 5,
networkType = "signed") # Signed network analysis
## pickSoftThreshold: will use block size 2679.
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 2679 of 16699
## Warning: executing %dopar% sequentially: no parallel backend registered
## ..working on genes 2680 through 5358 of 16699
## ..working on genes 5359 through 8037 of 16699
## ..working on genes 8038 through 10716 of 16699
## ..working on genes 10717 through 13395 of 16699
## ..working on genes 13396 through 16074 of 16699
## ..working on genes 16075 through 16699 of 16699
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.63600 6.140 0.933 8780.0 8830.00 10100
## 2 2 0.00754 0.254 0.907 4920.0 4870.00 6650
## 3 3 0.36700 -1.600 0.862 2900.0 2780.00 4710
## 4 4 0.54000 -1.930 0.850 1790.0 1640.00 3490
## 5 5 0.64900 -2.110 0.862 1150.0 994.00 2660
## 6 6 0.72600 -2.180 0.890 758.0 618.00 2090
## 7 7 0.73700 -2.240 0.887 516.0 399.00 1670
## 8 8 0.75800 -2.220 0.900 360.0 264.00 1360
## 9 9 0.76500 -2.210 0.906 257.0 179.00 1120
## 10 10 0.77400 -2.210 0.914 187.0 124.00 930
## 11 11 0.79600 -2.180 0.931 139.0 86.60 781
## 12 12 0.79200 -2.190 0.932 105.0 61.50 661
## 13 13 0.79800 -2.180 0.940 79.9 44.30 564
## 14 14 0.81000 -2.160 0.950 61.8 32.20 484
## 15 15 0.81700 -2.140 0.957 48.4 23.70 417
## 16 16 0.82500 -2.130 0.964 38.3 17.60 362
## 17 17 0.82900 -2.110 0.968 30.6 13.20 315
## 18 18 0.82800 -2.100 0.967 24.6 10.00 275
## 19 19 0.83300 -2.080 0.970 20.0 7.65 241
## 20 20 0.83600 -2.060 0.973 16.4 5.88 212
sft.c
## $powerEstimate
## [1] NA
##
## $fitIndices
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k.
## 1 1 0.636015761 6.1394748 0.9334311 8775.05249 8831.283111
## 2 2 0.007537047 0.2535258 0.9069594 4915.12183 4872.381459
## 3 3 0.367034640 -1.5954669 0.8619967 2900.49641 2782.560107
## 4 4 0.539533308 -1.9299624 0.8503500 1788.50077 1639.883356
## 5 5 0.649046240 -2.1088255 0.8622259 1145.18619 993.997367
## 6 6 0.725569395 -2.1803677 0.8897642 757.66787 617.571823
## 7 7 0.737363653 -2.2369381 0.8873196 515.85393 398.523326
## 8 8 0.758149710 -2.2194124 0.8996561 360.19136 264.274470
## 9 9 0.764659329 -2.2141017 0.9055804 257.17822 179.293912
## 10 10 0.773828696 -2.2096717 0.9138145 187.30323 123.753766
## 11 11 0.795795432 -2.1807600 0.9306162 138.84565 86.618236
## 12 12 0.792022423 -2.1922162 0.9322456 104.56527 61.483275
## 13 13 0.798088499 -2.1778465 0.9399064 79.87501 44.279191
## 14 14 0.809760349 -2.1570168 0.9500571 61.80090 32.246538
## 15 15 0.816984246 -2.1439052 0.9573274 48.37382 23.720143
## 16 16 0.824704730 -2.1282369 0.9638006 38.26456 17.609821
## 17 17 0.828549255 -2.1123552 0.9678029 30.55985 13.201554
## 18 18 0.827928825 -2.1004325 0.9671891 24.62187 10.005280
## 19 19 0.833130591 -2.0810743 0.9702946 19.99846 7.652881
## 20 20 0.836086602 -2.0611295 0.9725581 16.36459 5.881468
## max.k.
## 1 10079.1895
## 2 6652.2951
## 3 4710.6018
## 4 3485.4486
## 5 2664.3989
## 6 2088.5624
## 7 1669.9980
## 8 1356.8687
## 9 1117.0422
## 10 929.7508
## 11 781.0868
## 12 661.4486
## 13 564.0304
## 14 483.8962
## 15 417.3943
## 16 361.7750
## 17 314.9358
## 18 275.2474
## 19 241.4312
## 20 212.4739
save(mat.c, sft.c, file="2.1.c_WGCNA_ThresholdingPower.UT.RData")
Zooxanthellae¶
WGCNA: Define thresholding power
Library
library(WGCNA)
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /yshare1/home/toru/anaconda3/envs/wgcna/lib/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=ja_JP.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=ja_JP.UTF-8 LC_COLLATE=ja_JP.UTF-8
## [5] LC_MONETARY=ja_JP.UTF-8 LC_MESSAGES=ja_JP.UTF-8
## [7] LC_PAPER=ja_JP.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=ja_JP.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] WGCNA_1.69 fastcluster_1.1.25 dynamicTreeCut_1.63-1
## [4] rmarkdown_1.12
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.1 lattice_0.20-38 GO.db_3.10.0
## [4] png_0.1-7 digest_0.6.18 foreach_1.5.0
## [7] R6_2.4.1 backports_1.1.7 acepack_1.4.1
## [10] stats4_3.6.1 RSQLite_2.2.0 evaluate_0.13
## [13] ggplot2_3.3.1 pillar_1.4.4 rlang_0.4.6
## [16] data.table_1.12.8 rstudioapi_0.11 blob_1.2.1
## [19] S4Vectors_0.24.4 rpart_4.1-15 Matrix_1.2-17
## [22] preprocessCore_1.48.0 checkmate_2.0.0 splines_3.6.1
## [25] stringr_1.4.0 foreign_0.8-71 htmlwidgets_1.5.1
## [28] bit_1.1-15.2 munsell_0.5.0 compiler_3.6.1
## [31] xfun_0.6 pkgconfig_2.0.3 BiocGenerics_0.32.0
## [34] base64enc_0.1-3 htmltools_0.3.6 nnet_7.3-12
## [37] tibble_3.0.1 gridExtra_2.3 htmlTable_1.13.3
## [40] Hmisc_4.4-0 IRanges_2.20.2 codetools_0.2-16
## [43] matrixStats_0.56.0 crayon_1.3.4 grid_3.6.1
## [46] gtable_0.3.0 lifecycle_0.2.0 DBI_1.1.0
## [49] magrittr_1.5 scales_1.1.1 impute_1.60.0
## [52] stringi_1.4.3 doParallel_1.0.15 latticeExtra_0.6-29
## [55] ellipsis_0.3.1 vctrs_0.3.0 Formula_1.2-3
## [58] RColorBrewer_1.1-2 iterators_1.0.12 tools_3.6.1
## [61] bit64_0.9-7 Biobase_2.46.0 glue_1.3.1
## [64] jpeg_0.1-8.1 parallel_3.6.1 survival_3.1-12
## [67] yaml_2.2.0 AnnotationDbi_1.48.0 colorspace_1.4-1
## [70] cluster_2.0.8 memoise_1.1.0 knitr_1.22
Load files
# matFPKM.z : Gene expression table (FPKM level) of zooxanthellae
# matTPM.z : Gene expression table (TPM level) of zooxanthellae
load("Expression.UT.RData")
Preprocessing
Remove low-expressed genes
以下のページを参考に低発現遺伝子を除外する。ここではFPKM>1で5割のサンプルから検出されている遺伝子を選んだ。
Reference: https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/faq.html
4. Can WGCNA be used to analyze RNA-Seq data?
We suggest removing features whose counts are consistently low (for example, removing all features that have a count of less than say 10 in more than 90% of the samples) because such low-expressed features tend to reflect noise and correlations based on counts that are mostly zero aren't really meaningful.
n_detected = rowSums(matFPKM.z > 1)
detected = (n_detected / ncol(matFPKM.z) > 0.5)
table(detected)
## detected
## FALSE TRUE
## 6320 39284
mat.z = matFPKM.z[detected,] # choose mostly-detected genes
sum(apply(mat.z, 1, var) == 0) # check whether zero-variance gene is found
## [1] 0
Log2-transformation
mat.z = log2(mat.z + 1)
mat.z = t(mat.z) # transpose
WGCNA: Choose adequate soft-thresholding power
Reference: Step-by-step network construction and module detection
https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/FemaleLiver-02-networkConstr-man.pdf
powers = 1:30
sft.z = pickSoftThreshold(mat.z,
powerVector = powers,
verbose = 5,
networkType = "signed") # Signed network analysis
## pickSoftThreshold: will use block size 1138.
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 1138 of 39284
## Warning: executing %dopar% sequentially: no parallel backend registered
## ..working on genes 1139 through 2276 of 39284
## ..working on genes 2277 through 3414 of 39284
## ..working on genes 3415 through 4552 of 39284
## ..working on genes 4553 through 5690 of 39284
## ..working on genes 5691 through 6828 of 39284
## ..working on genes 6829 through 7966 of 39284
## ..working on genes 7967 through 9104 of 39284
## ..working on genes 9105 through 10242 of 39284
## ..working on genes 10243 through 11380 of 39284
## ..working on genes 11381 through 12518 of 39284
## ..working on genes 12519 through 13656 of 39284
## ..working on genes 13657 through 14794 of 39284
## ..working on genes 14795 through 15932 of 39284
## ..working on genes 15933 through 17070 of 39284
## ..working on genes 17071 through 18208 of 39284
## ..working on genes 18209 through 19346 of 39284
## ..working on genes 19347 through 20484 of 39284
## ..working on genes 20485 through 21622 of 39284
## ..working on genes 21623 through 22760 of 39284
## ..working on genes 22761 through 23898 of 39284
## ..working on genes 23899 through 25036 of 39284
## ..working on genes 25037 through 26174 of 39284
## ..working on genes 26175 through 27312 of 39284
## ..working on genes 27313 through 28450 of 39284
## ..working on genes 28451 through 29588 of 39284
## ..working on genes 29589 through 30726 of 39284
## ..working on genes 30727 through 31864 of 39284
## ..working on genes 31865 through 33002 of 39284
## ..working on genes 33003 through 34140 of 39284
## ..working on genes 34141 through 35278 of 39284
## ..working on genes 35279 through 36416 of 39284
## ..working on genes 36417 through 37554 of 39284
## ..working on genes 37555 through 38692 of 39284
## ..working on genes 38693 through 39284 of 39284
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.95000 4.3100 0.953 24500.0 25400.0 29000
## 2 2 0.82800 1.6400 0.783 16200.0 17100.0 22600
## 3 3 0.40300 0.6380 0.432 11200.0 11700.0 18100
## 4 4 0.00667 0.0669 0.248 7940.0 8230.0 14700
## 5 5 0.10800 -0.2940 0.440 5780.0 5850.0 12200
## 6 6 0.29500 -0.5630 0.607 4300.0 4220.0 10200
## 7 7 0.41600 -0.7530 0.701 3250.0 3070.0 8640
## 8 8 0.49900 -0.9040 0.765 2500.0 2250.0 7380
## 9 9 0.55800 -1.0200 0.810 1950.0 1670.0 6350
## 10 10 0.60700 -1.1100 0.844 1530.0 1250.0 5510
## 11 11 0.63900 -1.2100 0.864 1220.0 939.0 4800
## 12 12 0.66500 -1.2800 0.881 981.0 711.0 4210
## 13 13 0.68000 -1.3500 0.891 795.0 541.0 3700
## 14 14 0.70100 -1.4000 0.906 649.0 415.0 3270
## 15 15 0.71400 -1.4500 0.913 534.0 320.0 2900
## 16 16 0.72500 -1.5000 0.921 442.0 248.0 2590
## 17 17 0.74000 -1.5200 0.931 368.0 193.0 2310
## 18 18 0.75200 -1.5500 0.939 309.0 151.0 2070
## 19 19 0.76800 -1.5700 0.948 260.0 118.0 1860
## 20 20 0.77700 -1.5900 0.953 220.0 93.3 1670
## 21 21 0.78500 -1.6100 0.958 187.0 74.0 1510
## 22 22 0.79300 -1.6200 0.963 159.0 58.9 1370
## 23 23 0.80000 -1.6400 0.967 137.0 47.1 1240
## 24 24 0.80500 -1.6600 0.969 117.0 37.9 1130
## 25 25 0.81400 -1.6600 0.974 101.0 30.5 1030
## 26 26 0.82200 -1.6700 0.976 87.7 24.6 937
## 27 27 0.82900 -1.6800 0.979 76.1 20.0 856
## 28 28 0.83400 -1.6900 0.980 66.3 16.3 784
## 29 29 0.83700 -1.6900 0.983 57.9 13.3 718
## 30 30 0.84000 -1.7000 0.984 50.7 10.9 661
sft.z
## $powerEstimate
## [1] 1
##
## $fitIndices
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k.
## 1 1 0.950355436 4.31405293 0.9531192 24503.85086 25422.30150
## 2 2 0.827543211 1.63718305 0.7834519 16205.78944 17054.48510
## 3 3 0.403033472 0.63808513 0.4322007 11165.88150 11735.93827
## 4 4 0.006671372 0.06686317 0.2481058 7935.96869 8228.70388
## 5 5 0.108426753 -0.29352353 0.4397809 5782.13257 5851.77908
## 6 6 0.294731172 -0.56257169 0.6068116 4300.39997 4217.68996
## 7 7 0.415813355 -0.75250993 0.7011965 3254.73272 3067.39583
## 8 8 0.499315567 -0.90399401 0.7654386 2500.80099 2252.32583
## 9 9 0.558166370 -1.01932530 0.8101000 1947.09857 1669.86735
## 10 10 0.606907497 -1.11362015 0.8441644 1533.85221 1247.37269
## 11 11 0.639389080 -1.20684654 0.8643031 1221.01785 938.57902
## 12 12 0.665027881 -1.27870865 0.8814067 981.17651 711.38180
## 13 13 0.679616508 -1.34944879 0.8907980 795.19077 541.43573
## 14 14 0.700946489 -1.40230616 0.9057551 649.47429 415.40850
## 15 15 0.714348448 -1.45357032 0.9131524 534.23352 319.80937
## 16 16 0.724948163 -1.49558670 0.9205157 442.31096 247.88717
## 17 17 0.739620984 -1.52325511 0.9305581 368.40984 192.93024
## 18 18 0.752313451 -1.55056501 0.9389893 308.56523 150.95174
## 19 19 0.767718674 -1.57004084 0.9481615 259.77786 118.05191
## 20 20 0.776584732 -1.59281747 0.9528751 219.75689 93.29782
## 21 21 0.784670435 -1.61059715 0.9577007 186.73681 73.99915
## 22 22 0.792617962 -1.62322632 0.9628757 159.34562 58.93977
## 23 23 0.800169526 -1.64007420 0.9669684 136.50889 47.12940
## 24 24 0.805218125 -1.65573482 0.9693642 117.37905 37.91399
## 25 25 0.813842797 -1.66206086 0.9737209 101.28305 30.51697
## 26 26 0.821983494 -1.67186736 0.9764177 87.68295 24.64757
## 27 27 0.828877703 -1.67945788 0.9789746 76.14628 20.01177
## 28 28 0.833878051 -1.68672369 0.9801789 66.32341 16.30415
## 29 29 0.836735192 -1.69244809 0.9827587 57.93014 13.34331
## 30 30 0.839642049 -1.69896991 0.9842438 50.73436 10.94009
## max.k.
## 1 29008.7163
## 2 22564.7847
## 3 18059.9742
## 4 14735.8653
## 5 12199.6187
## 6 10218.0438
## 7 8641.3106
## 8 7377.8395
## 9 6353.1069
## 10 5506.1704
## 11 4799.5795
## 12 4205.1902
## 13 3701.5037
## 14 3271.8625
## 15 2903.1993
## 16 2585.1492
## 17 2309.4092
## 18 2069.2687
## 19 1859.2596
## 20 1674.8933
## 21 1512.4600
## 22 1368.8744
## 23 1241.5555
## 24 1128.3325
## 25 1027.3708
## 26 937.1119
## 27 856.2268
## 28 783.5770
## 29 718.1836
## 30 660.5107
save(mat.z, sft.z, file="2.1.z_WGCNA_ThresholdingPower.UT.RData")