Codes in paper

All codes for scRNA analysis in R

Introduction

  Here codes from P. Bischoff was showed. It concluded all step in analysis of scRNA, which could be refered and learned by all of us.

Explanation of codes

Preprocessing, cell type annotation and subsetting
preprocessing
  • load cellranger count matrices
  • filtering, normalization, dimensionality reduction and clustering
  • manual annotation of main cell types (epithelial, immune stromal)
  • add clinical metadata
  • calculate cell cycle phase
  • subset epithelial, immune and stromal cells
  • plots for figure 1, supp. fig. 1
cell type annotation
  • load epithelial, immune, and stromal subsets
  • rerun PCA, dimensionality reduction and clustering
  • assign tumor and normal cell clusters in epithelial subset
  • calculate expression of marker genes and gene signatures
  • manual annotation of epithelial, immune and stromal cell types
  • plots for supp. fig. 4

Separate analysis of epithelial, immune and stromal subsets

epithelial analyses
  • (use seurat object epi_anno to infer copy number aberrations, code can be found here: https://github.com/bischofp/single_cell_lung_adenocarcinoma)
  • epithelial cell type quantification
  • selected epithelial marker genes
  • subset tumor epithelial cells
  • differential gene expression analysis
  • infer pathway activities
  • rerun PCA and low-dimensional UMAPs
  • plot features of tumor cells sorted along PC1
  • plots for figures 2, supp. fig. 3
stromal analyses
  • stromal cell type quantification
  • selected stromal marker genes
  • differential gene expression analysis
  • infer pathway activities
  • plots for figure 3, supp. fig. 5
immune analyses
  • immune cell type quantification
  • selected immune marker genes
  • differential gene expression analysis
  • calculate gene signature module scores
  • plots for figure 4, supp. fig. 6

Integrative analysis of all subsets

correlation cell types
  • load epithelial, immune and stromal subsets
  • count cell types
  • PCA and correlation analysis of cell counts
  • plots for figures 3, 4, 5, supp. fig. 7
TCGA dataset
  • load TCGA data
  • load epithelial, immune and stromal subsets
  • perform differential gene expression analysis
  • perform ssGSEA on TCGA data using marker genes from scRNAseq
  • correlation analysis of ssGSEA enrichment scores
  • stratify patients based on ssGSEA enrichment scores, perform survival analysis
  • plots for figure 5, supp. fig. 7

#####################preprocessing

### load libraries
library(Seurat)
library(dplyr)
library(reticulate)
library(sctransform)
library(cowplot)
library(ggplot2)
library(viridis)
library(tidyr)
library(magrittr)
library(reshape2)
library(readxl)
library(progeny)
library(readr)
library(stringr)

nFeature_lower <- 500
nFeature_upper <- 10000
nCount_lower <- 1000
nCount_upper <- 100000
pMT_lower <- 0
pMT_upper <- 30
pHB_lower <- 0
pHB_upper <- 5

theme_set(theme_cowplot())

#color scheme
use_colors <- c(
  Tumor = "brown2",
  Normal = "deepskyblue2",
  G1 = "#46ACC8",
  G2M = "#E58601",
  S = "#B40F20",
  Epithelial = "seagreen",
  Immune = "darkgoldenrod2",
  Stromal = "steelblue",
  p018 = "#E2D200",
  p019 = "#46ACC8",
  p023 = "#E58601",
  p024 = "#B40F20",
  p027 = "#0B775E",
  p028 = "#E1BD6D",
  p029 = "#35274A",
  p030 = "#F2300F",
  p031 = "#7294D4",
  p032 = "#5B1A18",
  p033 = "#9C964A",
  p034 = "#FD6467")

# Data loading and QC

### sample list
samples <- read_excel("../data/metadata/patients_metadata.xlsx", range = cell_cols("A:A")) %>% .$sample_id

### import cellranger files from different data sets
for (i in seq_along(samples)){
  assign(paste0("scs_data", i), Read10X(data.dir = paste0("../data/cellranger/", samples[i], "/filtered_feature_bc_matrix")))
}

### create seurat objects from cellranger files
for (i in seq_along(samples)){
  assign(paste0("seu_obj", i), CreateSeuratObject(counts = eval(parse(text = paste0("scs_data", i))), project = samples[i], min.cells = 3))
}

### merge data sets
seu_obj <- merge(seu_obj1, y = c(seu_obj2, seu_obj3, seu_obj4, seu_obj5, seu_obj6, seu_obj7, seu_obj8, seu_obj9, seu_obj10, seu_obj11, seu_obj12, seu_obj13, seu_obj14, seu_obj15, seu_obj16, seu_obj17, seu_obj18, seu_obj19, seu_obj20), add.cell.ids = samples, project = "lung")

### calculate mitochondrial, hemoglobin and ribosomal gene counts
seu_obj <- PercentageFeatureSet(seu_obj, pattern = "^MT-", col.name = "pMT")
seu_obj <- PercentageFeatureSet(seu_obj, pattern = "^HBA|^HBB", col.name = "pHB")
seu_obj <- PercentageFeatureSet(seu_obj, pattern = "^RPS|^RPL", col.name = "pRP")

qcparams <- c("nFeature_RNA", "nCount_RNA", "pMT", "pHB", "pRP")
for (i in seq_along(qcparams)){
  print(VlnPlot(object = seu_obj, features = qcparams[i], group.by = "orig.ident", pt.size = 0))
}
for (i in seq_along(qcparams)){
  print(RidgePlot(object = seu_obj, features = qcparams[i], group.by = "orig.ident"))
}

VlnPlot(seu_obj, features = c("nFeature_RNA", "nCount_RNA", "pMT"), pt.size = 0, group.by = "orig.ident", ncol = 1, log = T)
ggsave2("SuppFig1B.pdf", path = "../results", width = 30, height = 20, units = "cm")

### clear environment
remove(seu_obj1)
remove(seu_obj2)
remove(seu_obj3)
remove(seu_obj4)
remove(seu_obj5)
remove(seu_obj6)
remove(seu_obj7)
remove(seu_obj8)
remove(seu_obj9)
remove(seu_obj10)
remove(seu_obj11)
remove(seu_obj12)
remove(seu_obj13)
remove(seu_obj14)
remove(seu_obj15)
remove(seu_obj16)
remove(seu_obj17)
remove(seu_obj18)
remove(seu_obj19)
remove(seu_obj20)

remove(scs_data1)
remove(scs_data2)
remove(scs_data3)
remove(scs_data4)
remove(scs_data5)
remove(scs_data6)
remove(scs_data7)
remove(scs_data8)
remove(scs_data9)
remove(scs_data10)
remove(scs_data11)
remove(scs_data12)
remove(scs_data13)
remove(scs_data14)
remove(scs_data15)
remove(scs_data16)
remove(scs_data17)
remove(scs_data18)
remove(scs_data19)
remove(scs_data20)

# Data Filtering 

qc_std_plot_helper <- function(x) x + 
  scale_color_viridis() +
  geom_point(size = 0.01, alpha = 0.3)

qc_std_plot <- function(seu_obj) {
  qc_data <- as_tibble(FetchData(seu_obj, c("nCount_RNA", "nFeature_RNA", "pMT", "pHB", "pRP")))
  plot_grid(

    qc_std_plot_helper(ggplot(qc_data, aes(log2(nCount_RNA), log2(nFeature_RNA), color = pMT))) + 
      geom_hline(yintercept = log2(nFeature_lower), color = "red", linetype = 2) +
      geom_hline(yintercept = log2(nFeature_upper), color = "red", linetype = 2) +
      geom_vline(xintercept = log2(nCount_lower), color = "red", linetype = 2) +
      geom_vline(xintercept = log2(nCount_upper), color = "red", linetype = 2),
    qc_std_plot_helper(ggplot(qc_data, aes(log2(nCount_RNA), log2(nFeature_RNA), color = pHB))) + 
      geom_hline(yintercept = log2(nFeature_lower), color = "red", linetype = 2) +
      geom_hline(yintercept = log2(nFeature_upper), color = "red", linetype = 2) +
      geom_vline(xintercept = log2(nCount_lower), color = "red", linetype = 2) +
      geom_vline(xintercept = log2(nCount_upper), color = "red", linetype = 2),
    qc_std_plot_helper(ggplot(qc_data, aes(log2(nCount_RNA), log2(nFeature_RNA), color = pRP))) + 
      geom_hline(yintercept = log2(nFeature_lower), color = "red", linetype = 2) +
      geom_hline(yintercept = log2(nFeature_upper), color = "red", linetype = 2) +
      geom_vline(xintercept = log2(nCount_lower), color = "red", linetype = 2) +
      geom_vline(xintercept = log2(nCount_upper), color = "red", linetype = 2),

    qc_std_plot_helper(ggplot(qc_data, aes(log2(nCount_RNA), pMT, color = nFeature_RNA))) + 
      geom_hline(yintercept = pMT_lower, color = "red", linetype = 2) +
      geom_hline(yintercept = pMT_upper, color = "red", linetype = 2) +
      geom_vline(xintercept = log2(nCount_lower), color = "red", linetype = 2) +
      geom_vline(xintercept = log2(nCount_upper), color = "red", linetype = 2),
    qc_std_plot_helper(ggplot(qc_data, aes(log2(nCount_RNA), pHB, color = nFeature_RNA))) + 
      geom_hline(yintercept = pHB_lower, color = "red", linetype = 2) +
      geom_hline(yintercept = pHB_upper, color = "red", linetype = 2) +
      geom_vline(xintercept = log2(nCount_lower), color = "red", linetype = 2) +
      geom_vline(xintercept = log2(nCount_upper), color = "red", linetype = 2),
    qc_std_plot_helper(ggplot(qc_data, aes(log2(nCount_RNA), pRP, color = nFeature_RNA))) + 
      geom_vline(xintercept = log2(nCount_lower), color = "red", linetype = 2) +
      geom_vline(xintercept = log2(nCount_upper), color = "red", linetype = 2),

    qc_std_plot_helper(ggplot(qc_data, aes(log2(nFeature_RNA), pMT, color = nCount_RNA))) + 
      geom_hline(yintercept = pMT_lower, color = "red", linetype = 2) +
      geom_hline(yintercept = pMT_upper, color = "red", linetype = 2) +
      geom_vline(xintercept = log2(nFeature_lower), color = "red", linetype = 2) +
      geom_vline(xintercept = log2(nFeature_upper), color = "red", linetype = 2),
    qc_std_plot_helper(ggplot(qc_data, aes(log2(nFeature_RNA), pHB, color = nCount_RNA))) + 
      geom_hline(yintercept = pHB_lower, color = "red", linetype = 2) +
      geom_hline(yintercept = pHB_upper, color = "red", linetype = 2) +
      geom_vline(xintercept = log2(nFeature_lower), color = "red", linetype = 2) +
      geom_vline(xintercept = log2(nFeature_upper), color = "red", linetype = 2),
    qc_std_plot_helper(ggplot(qc_data, aes(log2(nFeature_RNA), pRP, color = nCount_RNA))) + 
      geom_vline(xintercept = log2(nFeature_lower), color = "red", linetype = 2) +
      geom_vline(xintercept = log2(nFeature_upper), color = "red", linetype = 2),

    qc_std_plot_helper(ggplot(qc_data, aes(pRP, pMT, color = nCount_RNA))) + 
      geom_hline(yintercept = pMT_lower, color = "red", linetype = 2) +
      geom_hline(yintercept = pMT_upper, color = "red", linetype = 2),
    qc_std_plot_helper(ggplot(qc_data, aes(pRP, pMT, color = nFeature_RNA))) + 
      geom_hline(yintercept = pMT_lower, color = "red", linetype = 2) +
      geom_hline(yintercept = pMT_upper, color = "red", linetype = 2),

    ggplot(gather(qc_data, key, value), aes(key, value)) +
      geom_violin() +
      facet_wrap(~key, scales = "free", ncol = 5),

    ncol = 3, align = "hv"
  )
}

## Before filtering

seu_obj_unfiltered <- seu_obj

#saveRDS(seu_obj_unfiltered, file = "seurat_objects/all_unfiltered.RDS")

qc_std_plot(seu_obj_unfiltered)

ggsave2("SuppFig1A.png", path = "../results", width = 30, height = 30, units = "cm")

## After filtering

#seu_obj_unfiltered <- readRDS("all_unfiltered.RDS")

seu_obj <- subset(seu_obj_unfiltered, subset = nFeature_RNA > nFeature_lower & nFeature_RNA < nFeature_upper & nCount_RNA > nCount_lower & nCount_RNA < nCount_upper & pMT < pMT_upper & pHB < pHB_upper)

qc_std_plot(seu_obj)

seu_obj_unfiltered
seu_obj

# Data normalization

seu_obj <- SCTransform(seu_obj, verbose = T, vars.to.regress = c("nCount_RNA", "pMT"), conserve.memory = T)

#saveRDS(seu_obj, file = "seurat_objects/all_SCTransform.RDS")

# Dimensionality reduction

seu_obj <- RunPCA(seu_obj)
ElbowPlot(seu_obj, ndims = 50)

#for (i in c(15, 20)) {
#  umaptest <- RunUMAP(seu_obj, dims = 1:i, verbose = F)
#  print(DimPlot(umaptest, reduction = "umap", group.by = "orig.ident") + labs(title = paste0(i, " dimensions")))
#  print(FeaturePlot(umaptest, features = c("EPCAM", "PTPRC"), sort.cell = T))
#  print(FeaturePlot(umaptest, features = c("MARCO", "KIT"), sort.cell = T))
#  print(FeaturePlot(umaptest, features = c("FOXJ1", "AGER"), sort.cell = T))
#  print(FeaturePlot(umaptest, features = c("JCHAIN", "VWF"), sort.cell = T))
#  remove(umaptest)
#}

seu_obj <- RunUMAP(seu_obj, dims = 1:15, verbose = T)
seu_obj <- FindNeighbors(seu_obj, dims = 1:15)

for (i in c(0.2, 0.3, 0.4, 0.5, 1, 2)) {
  seu_obj <- FindClusters(seu_obj, resolution = i)
  print(DimPlot(seu_obj, reduction = "umap") + labs(title = paste0("resolution: ", i)))
}

for (i in c("nFeature_RNA", "nCount_RNA", "pMT", "pHB", "pRP")) {
  print(FeaturePlot(seu_obj, features = i, coord.fixed = T, sort.cell = T))
}

#DimPlot(seu_obj, group.by = "orig.ident")
#DimPlot(seu_obj, group.by = "SCT_snn_res.0.5", label = T)

# Main cell type annotation

mainmarkers <- c("PECAM1", "VWF", "ACTA2", "JCHAIN", "MS4A1", "PTPRC", "CD68", "KIT", "EPCAM", "CDH1", "KRT7", "KRT19")

for (i in seq_along(mainmarkers)) {
  FeaturePlot(seu_obj, features = mainmarkers[i], coord.fixed = T, order = T, cols = viridis(10))
  #ggsave2(paste0("FeaturePlot_mainmarkers_", mainmarkers[i], ".png"), path = "output/annotation", width = 10, height = 10, units = "cm")
}

DotPlot(seu_obj, features = mainmarkers, group.by = "SCT_snn_res.0.2") + 
  coord_flip() + 
  scale_color_viridis()
#ggsave2("DotPlot_mainmarkers.png", path = "output/annotation", width = 30, height = 8, units = "cm")

DimPlot(seu_obj, group.by = "SCT_snn_res.0.2", label = T, label.size = 5)
#ggsave2("DimPlot_all_clusters.png", path = "output/annotation", width = 20, height = 20, units = "cm")

Idents(seu_obj) <- seu_obj$SCT_snn_res.0.2
annotation_curated_main <- read_excel("../data/curated_annotation/curated_annotation_main.xlsx")
new_ids_main <- annotation_curated_main$main_cell_type
names(new_ids_main) <- levels(seu_obj)
seu_obj <- RenameIdents(seu_obj, new_ids_main)
seu_obj@meta.data$main_cell_type <- Idents(seu_obj)

# Add metadata

metatable <- read_excel("../data/metadata/patients_metadata.xlsx")

metadata <- FetchData(seu_obj, "orig.ident")
metadata$cell_id <- rownames(metadata)
metadata$sample_id <- metadata$orig.ident
metadata <- left_join(x = metadata, y = metatable, by = "sample_id")
rownames(metadata) <- metadata$cell_id

seu_obj <- AddMetaData(seu_obj, metadata = metadata)

# Cell cycle scoring

### add cell cycle, cc.genes loaded with Seurat

s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes

score_cc <- function(seu_obj) {
  seu_obj <- CellCycleScoring(seu_obj, s.genes, g2m.genes)
  seu_obj@meta.data$CC.Diff <- seu_obj@meta.data$S.Score - seu_obj@meta.data$G2M.Score
  return(seu_obj)
}

seu_obj <- score_cc(seu_obj)

FeatureScatter(seu_obj, "G2M.Score", "S.Score", group.by = "Phase", pt.size = .1) +
  coord_fixed(ratio = 1)

# Subset, rescale and save RDS files

###subset and rescale

#saveRDS(seu_obj, file = "seurat_objects/all.RDS")

Idents(seu_obj) <- seu_obj@meta.data$main_cell_type

epi <- subset(seu_obj, idents = "Epithelial")
imm <- subset(seu_obj, idents = "Immune")
str <- subset(seu_obj, idents = "Stromal")

epi <- ScaleData(epi)
imm <- ScaleData(imm)
str <- ScaleData(str)

epi
imm
str

#saveRDS(epi, file = "seurat_objects/epi.RDS")
#saveRDS(imm, file = "seurat_objects/imm.RDS")
#saveRDS(str, file = "seurat_objects/str.RDS")

# Plots for figure 1

###r plots for figure 1

DimPlot(seu_obj, group.by = "tissue_type", cols = use_colors, pt.size = 0.1)
ggsave2("Fig1B.png", path = "../results", width = 15, height = 15, units = "cm")

DimPlot(seu_obj, group.by = "patient_id", cols = use_colors, pt.size = 0.1)
ggsave2("Fig1C.png", path = "../results", width = 15, height = 15, units = "cm")

DimPlot(seu_obj, group.by = "main_cell_type", cols = use_colors, pt.size = 0.1)
ggsave2("Fig1D_umap.png", path = "../results", width = 15, height = 15, units = "cm")

cell_types <- FetchData(seu_obj, vars = c("sample_id", "main_cell_type", "tissue_type")) %>% 
  mutate(main_cell_type = factor(main_cell_type, levels = c("Stromal", "Immune", "Epithelial"))) %>% 
  mutate(sample_id = factor(sample_id, levels = rev(c("p018t", "p019t", "p023t", "p024t", "p027t", "p028t", "p030t", "p031t", "p032t", "p033t", "p034t", "p018n", "p019n", "p027n", "p028n", "p029n", "p030n", "p031n", "p032n", "p033n", "p034n"))))

ggplot(data = cell_types) + 
  geom_bar(mapping = aes(x = sample_id, fill = main_cell_type, ), position = "fill", width = 0.75) +
  scale_fill_manual(values = use_colors) +
  coord_flip()
ggsave2("Fig1D_barplot.pdf", path = "../results", width = 15, height = 30, units = "cm")

remove(seu_obj)

##########################################cell type annotation

### load libraries
library(Seurat)
library(dplyr)
library(reticulate)
library(sctransform)
library(cowplot)
library(ggplot2)
library(viridis)
library(tidyr)
library(magrittr)
library(reshape2)
library(readxl)
library(readr)
library(stringr)
library(progeny)
library(scales)

theme_set(theme_cowplot())

#color scheme
use_colors <- c(
  Tumor = "brown2",
  Normal = "deepskyblue2",
  G1 = "#46ACC8",
  G2M = "#E58601",
  S = "#B40F20",
  Epithelial = "seagreen",
  Immune = "darkgoldenrod2",
  Stromal = "steelblue",
  p018 = "#E2D200",
  p019 = "#46ACC8",
  p023 = "#E58601",
  p024 = "#B40F20",
  p027 = "#0B775E",
  p028 = "#E1BD6D",
  p029 = "#35274A",
  p030 = "#F2300F",
  p031 = "#7294D4",
  p032 = "#5B1A18",
  p033 = "#9C964A",
  p034 = "#FD6467")

# Load data

#epi <- readRDS("seurat_objects/epi.RDS")
#imm <- readRDS("seurat_objects/imm.RDS")
#str <- readRDS("seurat_objects/str.RDS")

# Rerun PCA, reclustering

### epithelial subclustering

epi <- RunPCA(epi)
ElbowPlot(epi,  ndims = 50)

#for (i in c(10, 15, 20, 25)){
#  umaptest <- RunUMAP(epi, dims = 1:i, verbose = F)
#  print(DimPlot(umaptest, reduction = "umap", group.by = "patient_id", split.by = "tissue_type") + labs(title = paste0(i, "dimensions")))
#  remove(umaptest)
#}

epi <- RunUMAP(epi, dims = 1:20)
epi <- FindNeighbors(epi, dims = 1:20)
for (i in c(0.2, 0.3, 0.4, 0.5, 1, 2)) {
  epi <- FindClusters(epi, resolution = i)
  print(DimPlot(epi, reduction = "umap", label = T) + labs(title = paste0("resolution: ", i)))
}

Idents(epi) <- epi@meta.data$SCT_snn_res.1

### immune subclustering

imm <- RunPCA(imm)
ElbowPlot(imm,  ndims = 50)

#for (i in c(10, 15, 20, 25)){
#  umaptest <- RunUMAP(imm, dims = 1:i, verbose = F)
#  print(DimPlot(umaptest, reduction = "umap", group.by = "patient_id", split.by = "tissue_type") + labs(title = paste0(i, " dimensions")))
#  remove(umaptest)
#}

imm <- RunUMAP(imm, dims = 1:20)
imm <- FindNeighbors(imm, dims = 1:20)
for (i in c(0.2, 0.3, 0.4, 0.5, 1, 2)) {
  imm <- FindClusters(imm, resolution = i)
  print(DimPlot(imm, reduction = "umap") + labs(title = paste0("resolution: ", i)))
}

Idents(imm) <- imm@meta.data$SCT_snn_res.0.5

### stromal sublustering

str <- RunPCA(str)
ElbowPlot(str, ndims = 50)

#for (i in c(5, 10, 15, 20, 25, 30)){
#  umaptest <- RunUMAP(str, dims = 1:i, verbose = F)
#  print(DimPlot(umaptest, reduction = "umap", group.by = "patient_id", split.by = "tissue_type") + labs(title = paste0(i, " dimensions")))
#  print(DimPlot(umaptest, reduction = "umap", group.by = "tissue_type") + labs(title = paste0(i, " dimensions")))
#  remove(umaptest)
#}

str <- RunUMAP(str, dims = 1:20)
str <- FindNeighbors(str, dims = 1:20)
for (i in c(0.2, 0.3, 0.4, 0.5, 1, 2)) {
  str <- FindClusters(str, resolution = i)
  print(DimPlot(str, reduction = "umap") + labs(title = paste0("resolution: ", i)))
}

Idents(str) <- str@meta.data$SCT_snn_res.1

# Define normal and tumor cell clusters

DimPlot(epi, group.by = "SCT_snn_res.1", label = T, repel = T, split.by = "tissue_type")
ggsave2("SuppFig3A.png", path = "../results", width = 30, height = 15, units = "cm")

### compare proportion of cells in a cluster to all epithelial cells for tumor and normal separately, clusters overrepresented in normal samples are supposed to be cell of normal lung parenchyma, all other clusters are supposed to be tumor cells

epi_clusters <- FetchData(epi, vars = c("SCT_snn_res.1", "tissue_type"))

count_tumor <- epi_clusters %>% filter(tissue_type == "Tumor") %>% count() %>% as.numeric()
count_normal <- epi_clusters %>% filter(tissue_type == "Normal") %>% count() %>% as.numeric()

epi_counts <- epi_clusters %>% group_by(tissue_type) %>% count(SCT_snn_res.1)
proportion_tumor <- epi_counts %>% filter(tissue_type == "Tumor") %>% mutate(proportion = n/count_tumor)
proportion_normal <- epi_counts %>% filter(tissue_type == "Normal") %>% mutate(proportion = n/count_normal)

proportion_epi <- full_join(proportion_normal, proportion_tumor, by = "SCT_snn_res.1") %>% 
  mutate(proportion.x = ifelse(is.na(proportion.x), 0,  proportion.x)) %>%  
  mutate(proportion.y = ifelse(is.na(proportion.y), 0,  proportion.y)) %>%
  mutate(tissue_type.x = "Normal") %>%
  mutate(tissue_type.y = "Tumor") %>%
  mutate(cluster_type = ifelse(proportion.x > proportion.y, "Normal", "Tumor"))

cluster_type_data <- left_join(x = epi_clusters, y = proportion_epi, by = "SCT_snn_res.1")
rownames(cluster_type_data) <- rownames(epi_clusters)

epi <- AddMetaData(epi, select(cluster_type_data, cluster_type))

### Bar plot for figure 2

n1 <- select(proportion_epi, c(tissue_type.x, SCT_snn_res.1, proportion.x)) %>%
  mutate(tissue_type = tissue_type.x) %>% 
  mutate(proportion = proportion.x) %>%
  mutate(tissue_type.x = NULL) %>%
  mutate(proportion.x = NULL)
t1 <- select(proportion_epi, c(tissue_type.y, SCT_snn_res.1, proportion.y)) %>%
  mutate(tissue_type = tissue_type.y) %>% 
  mutate(proportion = proportion.y) %>%
  mutate(tissue_type.y = NULL) %>%
  mutate(proportion.y = NULL)

proportion_epi2 <- rbind(n1, t1)

ggplot(proportion_epi2, aes(fill = tissue_type, y = proportion, x = SCT_snn_res.1)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.7) +
  scale_fill_manual(values = use_colors)

ggsave2("SuppFig3B.pdf", path = "../results", width = 40, height = 20, units = "cm")

# Cell type scoring

myeloid_markers <- c("S100A12", "FCN1", "S100A8", "S100A9", "CD14", "CTSS","VCAN", "LYZ", 
                     "MARCO", "FCGR1A", "C1QA", "APOC1", "LGMN", "CTSB", "FCGR3A", 
                     "MAFB", "MAF", "CX3CR1", "ITGAM", "CSF1R",
                     "FABP4", "MCEMP1", 
                     "IL1B", "CXCL8", 
                     "APOE", "CD163", "C1QB", "C1QC", 
                     "FCER1A", "CD1C", "CLEC9A", 
                     "LILRA4", "CLEC4C", "JCHAIN", "IL3RA", "NRP1", 
                     "CLEC10A", "PTCRA", "CCR7", "LAMP3", 
                     "ITGAX", "CD68", "MKI67", "CDK1", "EPCAM")

tcell_nk_markers <- c("CD3E", "CD4", "FOXP3", "IL7R", "IL2RA", "CD40LG", "CD8A", "CCL5", "NCR1", "NKG7", "GNLY", "NCAM1", "KLRD1", "KLRB1", "CD69", "KLRG1", "MKI67", "CDK1", "EPCAM")

bcell_plasma_mast_markers <- c("MS4A1", "CD19", "CD79A", "JCHAIN", "IGHA1", "IGHA2", "IGHG1", "IGHG2", "IGHG3", "IGHG4", "IGKC", "IGLC2", "IGLC3", "CPA3", "KIT", "MS4A2", "GATA2",  "MKI67", "CDK1", "EPCAM")

DotPlot(imm, features = myeloid_markers, group.by = "SCT_snn_res.0.5") + coord_flip()
#ggsave2("DotPlot_myeloid_markers.png", path = "output", width = 20, height = 20, units = "cm")

DotPlot(imm, features = tcell_nk_markers, group.by = "SCT_snn_res.0.5") + coord_flip()
#ggsave2("DotPlot_T_NK_markers.png", path = "output", width = 20, height = 20, units = "cm")

DotPlot(imm, features = bcell_plasma_mast_markers, group.by = "SCT_snn_res.0.5") + coord_flip()
#ggsave2("DotPlot_B_Plasma_markers.png", path = "output", width = 20, height = 20, units = "cm")

DimPlot(imm, group.by = "SCT_snn_res.0.5", label = T, split.by = "tissue_type")
#ggsave2("DimPlot_immune_clusters.png", path = "output", width = 30, height = 15, units = "cm")

DimPlot(imm, group.by = "patient_id", split.by = "tissue_type", cols = use_colors)
#ggsave2("DimPlot_immune_patients.png", path = "output", width = 30, height = 15, units = "cm")

## Habermann et al.
#https://www.biorxiv.org/content/10.1101/753806v1

habermann_epi <- c("ABCA3", "SFTPB", "SFTPC", "AGER", "PDPN",  "KRT5", "TRP63", "NGFR", "SCGB1A1", "MUC5B", "KRT17", "FOXJ1", "TMEM190", "CAPS", "CHGA", "CALCA", "ASCL1", "PTPRC", "EPCAM")

habermann_imm <- c("CD3E", "CD4", "FOXP3", "IL7R", "IL2RA", "CD40LG", "CD8A", "CCL5", "NCR1", "KLRB1", "NKG7", "LYZ", "CD68", "ITGAX", "MARCO", "FCGR1A", "FCGR3A", "C1QA", "APOC1", "S100A12", "FCN1", "S100A9", "CD14", "FCER1A", "CD1C", "CD16", "CLEC9A", "LILRA4", "CLEC4C", "JCHAIN", "IGHG1", "IGLL5", "MS4A1", "CD19", "CD79A", "CPA3", "KIT", "MKI67", "CDK1", "EPCAM")

habermann_oth <- c("VWF", "PECAM1", "CCL21", "PROX1", "ACTA2", "MYH11", "PDGFRB", "WT1", "UPK3B", "LUM", "PDGFRA", "MYLK", "HAS1", "PLIN2", "FAP", "PTPRC", "EPCAM")

### Epithelial genes according to Habermann et al.

### manual annotation of normal cell types & detection of immune cell contaminated clusters

DotPlot(SubsetData(epi, subset.name = "cluster_type", accept.value = "Normal"), features = habermann_epi, group.by = "SCT_snn_res.1") + theme(axis.text.x = element_text(angle = 90, hjust = 1))

### manual detection of immune cell contaminated clusters

DotPlot(SubsetData(epi, subset.name = "cluster_type", accept.value = "Tumor"), features = habermann_epi, group.by = "SCT_snn_res.1") + theme(axis.text.x = element_text(angle = 90, hjust = 1))

### Immune genes according to Habermann et al.

DotPlot(imm, features = habermann_imm, group.by = "SCT_snn_res.0.5") + coord_flip()

#for (i in seq_along(habermann_imm)) {
#  plotlist <- list()
#  plotlist[1] <- FeaturePlot(imm, features = habermann_imm[i], sort.cell = T, combine = F)
#  plotlist[2] <- VlnPlot(imm, features = habermann_imm[i], pt.size = 0, combine = F)
#  print(CombinePlots(plots = plotlist))
#}

### Stromal genes according to Habermann et al.

DotPlot(str, features = habermann_oth, group.by = "SCT_snn_res.1") + 
  coord_flip()

#for (i in seq_along(habermann_oth)) {
#  plotlist <- list()
#  plotlist[1] <- FeaturePlot(str, features = habermann_oth[i], sort.cell = T, combine = F)
#  plotlist[2] <- VlnPlot(str, features = habermann_oth[i], pt.size = 0, combine = F)
#  print(CombinePlots(plots = plotlist, ncol = 3))
#}

## Travaglini et al.
#https://www.biorxiv.org/content/10.1101/742320v1

#load marker gene lists

sheets <- paste0("Cluster ", c(1:58))
sheets <- sheets[-43]

signaturelist <- list()

for (i in seq_along(sheets)) {
  a <- read_excel("../data/data/media-3.xlsx", sheet = sheets[[i]])
  a <- filter(a, a$...2 > 0.7 & a$...4 < 0.3)
  signaturelist <- c(signaturelist,a[1])
  remove(a)
}

#generate list with names of module scores in seurat object

names_of_modulescores <- c()
for (i in seq_along(signaturelist)){
  names_of_modulescores <- c(names_of_modulescores, paste0("T_", names(signaturelist[i]), i))
}

names_of_modulescores <- gsub(names_of_modulescores, pattern = " ", replacement = ".", fixed = TRUE)
names_of_modulescores <- gsub(names_of_modulescores, pattern = "+", replacement = ".", fixed = TRUE)
names_of_modulescores <- gsub(names_of_modulescores, pattern = "/", replacement = ".", fixed = TRUE)

#names_of_modulescores_unfiltered <- c()
#for (i in seq_along(signaturelist)){
#  names_of_modulescores_unfiltered <- c(names_of_modulescores_unfiltered, paste0(names(signaturelist[i]), "_unfiltered", i))
#}
#names_of_modulescores_unfiltered <- gsub(names_of_modulescores_unfiltered, pattern = " ", replacement = ".", fixed = TRUE)
#names_of_modulescores_unfiltered <- gsub(names_of_modulescores_unfiltered, pattern = "+", replacement = ".", fixed = TRUE)
#names_of_modulescores_unfiltered <- gsub(names_of_modulescores_unfiltered, pattern = "/", replacement = ".", fixed = TRUE)
#signature_list_updated <- list()
#for (i in seq_along(sheets)) {
#  signature_list_updated[[i]] <- checkGeneSymbols(signature_list[[i]])
#}

#calculate module scores for different subsets

epi <- AddModuleScore(object = epi, features = signaturelist, name = paste0("T_", names(signaturelist)))
imm <- AddModuleScore(object = imm, features = signaturelist, name = paste0("T_", names(signaturelist)))
str <- AddModuleScore(object = str, features = signaturelist, nbin = 12 , name = paste0("T_", names(signaturelist)))

## Vieira Braga et al.
#https://www.nature.com/articles/s41591-019-0468-5

#load marker gene lists

teichmann_signatures_epi <- read.csv("../data/data/Fig1_DE_Lung_atlas_epithelial.csv")
teichmann_signatures_epi$gene <- as.character(teichmann_signatures_epi$gene)
teichmann_epi <- levels(teichmann_signatures_epi$cluster)

teichmann_signatures_imm <- read.csv("../data/data/Fig2_DE_Lung_atlas_immune.csv")
teichmann_signatures_imm$gene <- as.character(teichmann_signatures_imm$gene)
teichmann_imm <- levels(teichmann_signatures_imm$cluster)

signaturelist2 <- list()

for (i in seq_along(teichmann_epi)) {
  signaturelist2 <- c(signaturelist2, teichmann_signatures_epi %>% filter(cluster == teichmann_epi[i], pct.2 < 0.3, avg_logFC > 0.7) %>% select(gene))
}

for (i in seq_along(teichmann_imm)) {
  signaturelist2 <- c(signaturelist2, teichmann_signatures_imm %>% filter(cluster == teichmann_imm[i], pct.2 < 0.3, avg_logFC > 0.7) %>% select(gene))
}

names(signaturelist2) <- gsub(c(teichmann_epi, teichmann_imm), pattern = "_", replacement = " ")

#generate list with names of module scores in seurat object

names_of_modulescores2 <- c()
for (i in seq_along(signaturelist2)){
  names_of_modulescores2 <- c(names_of_modulescores2, paste0("VB_", names(signaturelist2[i]), i))
}
names_of_modulescores2 <- gsub(names_of_modulescores2, pattern = " ", replacement = ".", fixed = TRUE)

#calculate module scores for different subsets

epi <- AddModuleScore(epi, features = signaturelist2, name = paste0("VB_", names(signaturelist2)))
imm <- AddModuleScore(imm, features = signaturelist2, name = paste0("VB_", names(signaturelist2)))
str <- AddModuleScore(str, features = signaturelist2, nbin = 12, name = paste0("VB_", names(signaturelist2)))

# Curated cell type annotation and subsetting

###epithelial

annotation_curated_epi <- read_excel("../data/curated_annotation/curated_annotation_epi.xlsx")
epi_anno <- epi
new_ids_epi <- annotation_curated_epi$cell_type_epi
names(new_ids_epi) <- levels(epi_anno)
epi_anno <- RenameIdents(epi_anno, new_ids_epi)
epi_anno@meta.data$cell_type_epi <- Idents(epi_anno)

epi_anno <- subset(epi_anno, subset = cell_type_epi != "Immune_contamination")
epi_anno <- ScaleData(epi_anno)

###immune

annotation_curated_imm <- read_excel("../data/curated_annotation/curated_annotation_imm.xlsx")
imm_anno <- imm
new_ids_imm <- annotation_curated_imm$cell_type_imm
names(new_ids_imm) <- levels(imm_anno)
imm_anno <- RenameIdents(imm_anno, new_ids_imm)
imm_anno@meta.data$cell_type_imm <- Idents(imm_anno)

imm_anno <- subset(imm_anno, subset = cell_type_imm != "Epithelial_contamination")
imm_anno <- ScaleData(imm_anno)

###stromal

annotation_curated_str <- read_excel("../data/curated_annotation/curated_annotation_str.xlsx")
str_anno <- str
new_ids_str <- annotation_curated_str$cell_type_str
names(new_ids_str) <- levels(str_anno)
str_anno <- RenameIdents(str_anno, new_ids_str)
str_anno@meta.data$cell_type_str <- Idents(str_anno)

str_anno <- subset(str_anno, subset = cell_type_str != "Immune/Epithelial contamination")
str_anno <- ScaleData(str_anno)

###Travaglini et al.

names_of_modulescores_original <- names(signaturelist)

#Epithelial

epi_type <- FetchData(epi_anno, vars = c(names_of_modulescores))

for(i in seq_along(names_of_modulescores_original)) {
  colnames(epi_type)[i] <- names_of_modulescores_original[i]
}

epi_type %>%
  merge(FetchData(epi_anno, vars = c("cluster_type", "cell_type_epi")), by = 0) %>%
  filter(cluster_type == "Normal") %>%
  group_by(cell_type_epi) %>%
  pivot_longer(cols = names_of_modulescores_original, names_to = "T_cell_type") %>%
  group_by(cell_type_epi, T_cell_type) %>%
  summarise(mean = mean(value)) %>%
  mutate(mean = rescale(mean)) %>%
  ggplot() +
  geom_tile(aes(x = T_cell_type, y = cell_type_epi, fill = mean))+
  scale_fill_gradientn(colors = c("blue", "white", "red"),
                       breaks = c(0, 1),
                       labels = c("0", "1")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), axis.text.y = element_text(hjust = 1))
ggsave2("SuppFig5B_epithelial.pdf", path = "../results", width = 30, height = 15, units = "cm")

#Immune

imm_type <- FetchData(imm_anno, vars = c(names_of_modulescores))

for(i in seq_along(names_of_modulescores_original)) {
  colnames(imm_type)[i] <- names_of_modulescores_original[i]
}

imm_type %>%
  merge(FetchData(imm_anno, vars = "cell_type_imm"), by = 0) %>%
  group_by(cell_type_imm) %>%
  pivot_longer(cols = names_of_modulescores_original, names_to = "T_cell_type") %>%
  group_by(cell_type_imm, T_cell_type) %>%
  summarise(mean = mean(value)) %>%
  mutate(mean = rescale(mean)) %>%
  ggplot() +
  geom_tile(aes(x = T_cell_type, y = cell_type_imm, fill = mean))+
  scale_fill_gradientn(colors = c("blue", "white", "red"),
                       breaks = c(0, 1),
                       labels = c("0", "1")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), axis.text.y = element_text(hjust = 1))
ggsave2("SuppFig5B_immune.pdf", path = "../results", width = 30, height = 30, units = "cm")

#Stromal

str_type <- FetchData(str_anno, vars = c(names_of_modulescores))

for(i in seq_along(names_of_modulescores_original)) {
  colnames(str_type)[i] <- names_of_modulescores_original[i]
}

str_type %>%
  merge(FetchData(str_anno, vars = "cell_type_str"), by = 0) %>%
  group_by(cell_type_str) %>%
  pivot_longer(cols = names_of_modulescores_original, names_to = "T_cell_type") %>%
  group_by(cell_type_str, T_cell_type) %>%
  summarise(mean = mean(value)) %>%
  mutate(mean = rescale(mean)) %>%
  ggplot() +
  geom_tile(aes(x = T_cell_type, y = cell_type_str, fill = mean))+
  scale_fill_gradientn(colors = c("blue", "white", "red"),
                       breaks = c(0, 1),
                       labels = c("0", "1")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), axis.text.y = element_text(hjust = 1))
ggsave2("SuppFig5B_stromal.pdf", path = "../results", width = 30, height = 20, units = "cm")

###Vieira Braga et al.

names_of_modulescores_original2 <- names(signaturelist2)

#Epithelial

epi_type <- FetchData(epi_anno, vars = c(names_of_modulescores2))

for(i in seq_along(names_of_modulescores_original2)) {
  colnames(epi_type)[i] <- names_of_modulescores_original2[i]
}

epi_type %>%
  merge(FetchData(epi_anno, vars = c("cluster_type", "cell_type_epi")), by = 0) %>%
  filter(cluster_type == "Normal") %>%
  group_by(cell_type_epi) %>%
  pivot_longer(cols = names_of_modulescores_original2, names_to = "VB_cell_type") %>%
  group_by(cell_type_epi, VB_cell_type) %>%
  summarise(mean = mean(value)) %>%
  mutate(mean = rescale(mean)) %>%
  ggplot() +
  geom_tile(aes(x = VB_cell_type, y = cell_type_epi, fill = mean))+
  scale_fill_gradientn(colors = c("blue", "white", "red"),
                       breaks = c(0, 1),
                       labels = c("0", "1")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), axis.text.y = element_text(hjust = 1))
ggsave2("SuppFig5A_epithelial.pdf", path = "../results", width = 20, height = 10, units = "cm")

#Immune

imm_type <- FetchData(imm_anno, vars = c(names_of_modulescores2))

for(i in seq_along(names_of_modulescores_original2)) {
  colnames(imm_type)[i] <- names_of_modulescores_original2[i]
}

imm_type %>%
  merge(FetchData(imm_anno, vars = "cell_type_imm"), by = 0) %>%
  group_by(cell_type_imm) %>%
  pivot_longer(cols = names_of_modulescores_original2, names_to = "VB_cell_type") %>%
  group_by(cell_type_imm, VB_cell_type) %>%
  summarise(mean = mean(value)) %>%
  mutate(mean = rescale(mean)) %>%
  ggplot() +
  geom_tile(aes(x = VB_cell_type, y = cell_type_imm, fill = mean))+
  scale_fill_gradientn(colors = c("blue", "white", "red"),
                       breaks = c(0, 1),
                       labels = c("0", "1")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), axis.text.y = element_text(hjust = 1))
ggsave2("SuppFig5A_immune.pdf", path = "../results", width = 20, height = 30, units = "cm")

#Stromal

str_type <- FetchData(str_anno, vars = c(names_of_modulescores2))

for(i in seq_along(names_of_modulescores_original2)) {
  colnames(str_type)[i] <- names_of_modulescores_original2[i]
}

str_type %>%
  merge(FetchData(str_anno, vars = "cell_type_str"), by = 0) %>%
  group_by(cell_type_str) %>%
  pivot_longer(cols = names_of_modulescores_original2, names_to = "VB_cell_type") %>%
  group_by(cell_type_str, VB_cell_type) %>%
  summarise(mean = mean(value)) %>%
  mutate(mean = rescale(mean)) %>%
  ggplot() +
  geom_tile(aes(x = VB_cell_type, y = cell_type_str, fill = mean))+
  scale_fill_gradientn(colors = c("blue", "white", "red"),
                       breaks = c(0, 1),
                       labels = c("0", "1")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), axis.text.y = element_text(hjust = 1))
ggsave2("SuppFig5A_stromal.pdf", path = "../results", width = 20, height = 20, units = "cm")

#hallmark signatures

broad_pws <- read_lines("../data/data/h.all.v6.2.symbols.gmt") %>%
  lapply(str_split, "\\t") %>% 
  unlist(recursive = F) %>% 
  lapply(function(x) setNames(list(x[-c(1:2)]), x[1])) %>% 
  unlist(recursive = F)

epi_anno <- AddModuleScore(object = epi_anno, features = broad_pws, name = names(broad_pws))
imm_anno <- AddModuleScore(object = imm_anno, features = broad_pws, name = names(broad_pws))
str_anno <- AddModuleScore(object = str_anno, features = broad_pws, name = names(broad_pws), nbin = 12)

#progeny signatures

epi_anno <- progeny(epi_anno, scale = F, organism="Human", top=500, perm=1, return_assay=T)
epi_anno <- ScaleData(epi_anno, assay = "progeny")

### obviously too many cells for progeny function, progeny scores added after immune cells split into myeloid and lymphoid subset
#imm_anno <- progeny(imm_anno, scale = F, organism="Human", top=500, perm=1, return_assay=T)
#imm_anno <- ScaleData(imm_anno, assay = "progeny")

str_anno <- progeny(str_anno, scale = F, organism="Human", top=500, perm=1, return_assay=T)
str_anno <- ScaleData(str_anno, assay = "progeny")

#saveRDS(epi_anno, file = "seurat_objects/epi_anno.RDS")
#saveRDS(imm_anno, file = "seurat_objects/imm_anno.RDS")
#saveRDS(str_anno, file = "seurat_objects/str_anno.RDS")
remove(epi)
remove(imm)
remove(str)

#########################################epithelial analyses

### load libraries
library(ggplot2)
library(Seurat)
library(dplyr)
library(reticulate)
library(sctransform)
library(cowplot)
library(viridis)
library(tidyr)
library(magrittr)
library(reshape2)
library(readxl)
library(stringr)
library(cowplot)
library(scales)
library(tibble)
library(gplots)
library(RColorBrewer)

theme_set(theme_cowplot())

#color scheme
use_colors <- c(
  Tumor = "brown2",
  Normal = "deepskyblue2",
  G1 = "#46ACC8",
  G2M = "#E58601",
  S = "#B40F20",
  Epithelial = "seagreen",
  Immune = "darkgoldenrod2",
  Stromal = "steelblue",
  p018 = "#E2D200",
  p019 = "#46ACC8",
  p023 = "#E58601",
  p024 = "#B40F20",
  p027 = "#0B775E",
  p028 = "#E1BD6D",
  p029 = "#35274A",
  p030 = "#F2300F",
  p031 = "#7294D4",
  p032 = "#5B1A18",
  p033 = "#9C964A",
  p034 = "#FD6467",
  AT1 = "#2B8CBE",
  AT2 = "#045A8D",
  Club = "#006D2C",
  DifferentiatingCiliated = "#31A354",
  Ciliated = "#74C476",
  Neuroendocrine = "#8856A7",
  lepidic = "#0B775E",
  acinar = "#74A089",
  mucinuous (papillary) = "#E2D200",
  (micro)papillary = "#CEAB07",
  solid = "#B40F20",
  sarcomatoid = "#5B1A18",
  CNN = "chartreuse4",
  CNA = "orange")

#load data

#epi_anno <- readRDS("seurat_objects/epi_anno.RDS")

epi_anno@meta.data$cell_type_epi <- factor(epi_anno@meta.data$cell_type_epi, levels = c("AT2",
                                                                                        "AT1",
                                                                                        "Club",
                                                                                        "Ciliated",
                                                                                        "Neuroendocrine",
                                                                                        "Tumor"))

#add inferCNV clone scores (for inferCNV, use seurat object epi_anno and the following code: https://github.com/bischofp/single_cell_lung_adenocarcinoma, computation time ~12h)

scna_scores <- read.delim("../data/inferCNV_output/infercnv_clone_scores_nsclc.tsv") %>% filter(tissue_type == "Tumor") %>% filter(!is.na(cna_clone))
rownames(scna_scores) <- scna_scores$cell_id
scna_scores <- scna_scores %>% select(cna_clone) %>% mutate(cna_clone = as.character(cna_clone))
epi_anno <- AddMetaData(epi_anno, scna_scores)

scna_data <- FetchData(epi_anno, c("tissue_type", "cna_clone"))
scna_data <- scna_data %>% mutate(cna_clone = ifelse(is.na(cna_clone), "CNN", cna_clone))
epi_anno <- AddMetaData(epi_anno, scna_data)

#some plots

DotPlot(SubsetData(epi_anno, subset.name = "cluster_type", accept.value = "Normal"), features = c("ABCA3", "SFTPC", "AGER", "PDPN",  "KRT5", "TRP63", "NGFR", "SCGB1A1", "MUC5B", "FOXJ1", "TMEM190", "CHGA", "CALCA"), group.by = "cell_type_epi") + 
  coord_flip() + 
  scale_color_viridis()
#ggsave2("DotPlot_markergenes_epi_cell_type.emf", path = "../results", width = 11, height = 8, units = "cm")
ggsave2("Fig2B.png", path = "../results", width = 11, height = 8, units = "cm")

DimPlot(epi_anno, group.by = "cell_type_epi", cols = use_colors, pt.size = 0.5)
ggsave2("Fig2A_celltype.png", path = "../results", width = 15, height = 15, units = "cm")

DimPlot(epi_anno, group.by = "patient_id", cols = use_colors, pt.size = 0.5)
ggsave2("Fig2A_patients.png", path = "../results", width = 15, height = 15, units = "cm")

DimPlot(epi_anno, group.by = "tissue_type", cols = use_colors, pt.size = 0.5)
ggsave2("Fig2A_tissuetype.png", path = "../results", width = 15, height = 15, units = "cm")

epi_cell_counts <- FetchData(epi_anno, vars = c("tissue_type", "cell_type_epi", "cna_clone", "cluster_type")) %>%
  mutate(tissue_type = factor(tissue_type, levels = c("Tumor", "Normal")))

ggplot(data = epi_cell_counts, aes(x = tissue_type, fill = cell_type_epi)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = use_colors) +
  scale_y_reverse() +
  coord_flip()
ggsave2("Fig2A_barplot.pdf", path = "../results", width = 20, height = 5, units = "cm")

ggplot(data = epi_cell_counts, aes(x = tissue_type, fill = cna_clone)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = use_colors) +
  scale_y_reverse() +
  coord_flip()
ggsave2("SuppFig4_barplot_cna_clone.pdf", path = "../results", width = 20, height = 5, units = "cm")

ggplot(data = epi_cell_counts, aes(x = tissue_type, fill = cluster_type)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("cyan3", "darkorange2")) +
  coord_flip()
ggsave2("SuppFig4_barplot_cluster_type.pdf", path = "../results", width = 20, height = 5, units = "cm")

DimPlot(epi_anno, group.by = "cna_clone", cols = use_colors, pt.size = 0.5)
ggsave2("SuppFig4_umap_cna_clone.png", path = "../results", width = 15, height = 15, units = "cm")

DimPlot(epi_anno, group.by = "cluster_type", cols = c("cyan3", "darkorange2"), pt.size = 0.5)
ggsave2("SuppFig4_umap_cluster_type.png", path = "../results", width = 15, height = 15, units = "cm")

###subset tumor epithelial cells

epi_tumor <- SubsetData(SubsetData(epi_anno, subset.name = "cluster_type", accept.value = "Tumor"), subset.name = "tissue_type", accept.value = "Tumor")

epi_tumor <- ScaleData(epi_tumor)

###tumor cell marker genes

Idents(epi_tumor) <- epi_tumor@meta.data$patient_id

markers <- FindAllMarkers(epi_tumor, only.pos = T, min.pct = 0.25, min.diff.pct = 0.25)

top_TC_markers <- markers %>% group_by(cluster) %>% top_n(10, wt = avg_logFC)

DoHeatmap(epi_tumor, features = top_TC_markers$gene, group.by = "patient_id", draw.lines = F, group.colors = use_colors) +
  scale_fill_viridis()
#ggsave2("HeatMap_Tumor.pdf", path = "output/fig2", width = 30, height = 30, units = "cm")
ggsave2("Fig2C.png", path = "../results", width = 30, height = 30, units = "cm")

###mitotic activity

mitotic_activity <- FetchData(epi_tumor, c("tissue_type", "cell_type_epi", "Phase", "sample_id")) %>%
  mutate(sample_id = factor(sample_id, levels = c("p034t", "p033t", "p032t", "p031t", "p030t", "p027t", "p024t", "p023t", "p019t", "p018t")))

ggplot(mitotic_activity, aes(x = sample_id, fill = Phase)) +
  geom_bar(position = "fill", width = 0.75) +
  scale_fill_manual(values = use_colors) +
  coord_flip()
ggsave2("SuppFig3D.pdf", path = "../results", width = 12, height = 10, units = "cm")

###progeny pathway scores

#clustered heatmap progeny scores

progeny_scores <- as.data.frame(t(GetAssayData(epi_tumor, assay = "progeny", slot = "scale.data")))
progeny_scores$cell_id <- rownames(progeny_scores)
progeny_scores <- gather(progeny_scores, Pathway, Activity, -cell_id)

cells_clusters <- FetchData(epi_tumor, c("sample_id", "cluster_type")) %>% filter(str_detect(sample_id, "t"))
cells_clusters$cell_id <- rownames(cells_clusters)

progeny_scores <- inner_join(progeny_scores, cells_clusters)

summarized_progeny_scores <- progeny_scores %>% 
  group_by(Pathway, sample_id) %>% 
  summarise(avg = mean(Activity), std = sd(Activity)) %>%
  pivot_wider(id_cols = Pathway, names_from = sample_id, values_from = avg) %>%
  column_to_rownames("Pathway") %>%
  as.matrix()

pdf("../results/Fig2E.pdf", width = 6, height = 8)
heatmap.2(summarized_progeny_scores, trace = "none", density.info = "none", col = bluered(100))
dev.off()

###correlation mutational status ~ progeny scores

summarized_progeny_scores_mutations <- summarized_progeny_scores %>%
  t() %>%
  as.data.frame()

summarized_progeny_scores_mutations$Sample <- rownames(summarized_progeny_scores_mutations)
summarized_progeny_scores_mutations <- summarized_progeny_scores_mutations %>%
  mutate(KRAS_status = ifelse(Sample %in% c("p018t", "p023t", "p030t", "p031t", "p032t", "p033t"), "mutated", "wildtype")) %>%
  mutate(TP53_status = ifelse(Sample %in% c("p023t", "p027t"), "mutated", "wildtype")) %>%
  mutate(PIK3CA_status = ifelse(Sample %in% c("p031t"), "mutated", "wildtype")) %>%
  mutate(KRAS_status = factor(TP53_status, levels = c("wildtype", "mutated"))) %>%
  mutate(TP53_status = factor(TP53_status, levels = c("wildtype", "mutated")))

ggplot(summarized_progeny_scores_mutations, aes(x = KRAS_status, y = MAPK)) +
  geom_boxplot() +
  ggtitle(paste0(t.test(formula = MAPK~KRAS_status, data = summarized_progeny_scores_mutations, alternative = "two.sided", paired = F)$p.value))
ggsave2("SuppFig3C_KRAS~MAPK.pdf", path = "../results", width = 8, height = 8, units = "cm")

ggplot(summarized_progeny_scores_mutations, aes(x = KRAS_status, y = EGFR)) +
  geom_boxplot() +
  ggtitle(paste0(t.test(formula = EGFR~KRAS_status, data = summarized_progeny_scores_mutations, alternative = "two.sided", paired = F)$p.value))
ggsave2("SuppFig3C_KRAS~EGFR.pdf", path = "../results", width = 8, height = 8, units = "cm")

ggplot(summarized_progeny_scores_mutations, aes(x = TP53_status, y = p53)) +
  geom_boxplot() +
  ggtitle(paste0(t.test(formula = p53~TP53_status, data = summarized_progeny_scores_mutations, alternative = "two.sided", paired = F)$p.value))
ggsave2("SuppFig3C_p53~TP53.pdf", path = "../results", width = 8, height = 8, units = "cm")

### rerun PCA

epi_pca <- epi_tumor

epi_pca <- RunPCA(epi_pca)

DimPlot(epi_pca, reduction = "pca", group.by = "patient_id", dims = c(1,2))
DimPlot(epi_pca, reduction = "pca", group.by = "patient_id", dims = c(3,4))

DimPlot(epi_pca, reduction = "pca", group.by = "histo_subtype")

DimHeatmap(epi_pca, dims = 1, cells = 1000, balanced = T, fast = F, nfeatures = 60) +
  scale_fill_viridis()
#ggsave2("DimHeatmap_epitumor_PC1.pdf", path = "output/fig2", width = 10, height = 20, units = "cm")
ggsave2("SuppFig3F.png", path = "../results", width = 10, height = 20, units = "cm")

### low-dimensional UMAPs

DimPlot(epi_pca, group.by = "patient_id")
DimPlot(epi_pca, group.by = "histo_subtype")

for (i in c(4, 6, 8)){
  umaptest <- RunUMAP(epi_pca, dims = 1:i, verbose = F)
  print(DimPlot(umaptest, reduction = "umap", group.by = "patient_id", cols = use_colors) + labs(title = paste0(i, " dimensions")) + coord_fixed())
  ggsave2(paste0("SuppFig3E_", i, "dimensions_patient.png"), path = "../results", width = 15, height = 10, units = "cm")
  print(DimPlot(umaptest, reduction = "umap", group.by = "histo_subtype", cols = use_colors) + labs(title = paste0(i, " dimensions")) + coord_fixed())
  ggsave2(paste0("SuppFig3E_", i, "dimensions_histo.png"), path = "../results", width = 15, height = 10, units = "cm")
  print(FeaturePlot(umaptest, features = "PC_1", order = T, cols = viridis(10)) + labs(title = paste0(i, " dimensions")) + coord_fixed())
  ggsave2(paste0("SuppFig3E_", i, "dimensions_PC1.png"), path = "../results", width = 15, height = 10, units = "cm")
  remove(umaptest)
}

### bin cells along differentation gradients

# tumor cell signatures "alveolar/club-like" = tumor_signature, "undifferentiated" = anti_tumor_signature

epi_pca <- AddModuleScore(epi_pca, features = list(c("SCGB3A1", "PIGR", "NAPSA", "C4BPA", "SCGB3A2", "HLA-DRA", "CD74", "ADGRF5", "C16orf89", "FOLR1", "SELENBP1", "HLA-DRB1", "ID4", "MGP", "AQP3", "CA2", "LHX9", "HLA-DPB1", "FMO5", "GKN2", "C5", "MUC1", "NPC2", "RNASE1", "PIK3C2G", "SFTA2", "SLC34A2", "HLA-DPA1", "FGFR3", "PGC")), name = "tumor_signature")
epi_pca <- AddModuleScore(epi_pca, features = list(c("DSG2", "CAMK2N1", "FAM3C", "KRT7", "IFI27", "SLC2A1", "MARCKS", "PLAU", "AHNAK2", "PERP", "S100A4", "KRT19", "COL6A1", "UACA", "COL17A1", "CDA", "TPM2", "S100A16", "KRT8", "PRSS23", "DST", "LAMC2", "S100P", "PRSS3", "LAMA3", "DSP", "ITGA3", "MDK", "FAM83A", "ITGB4")), name = "anti_tumor_signature")

epi_tumor_data <- FetchData(epi_pca, c(colnames(epi_pca@meta.data), paste0("PC_", 1:10)))

progeny_scores_data <- as.data.frame(t(GetAssayData(epi_tumor, assay = "progeny", slot = "scale.data")))
progeny_scores_data$cell_id <- rownames(progeny_scores_data)

epi_tumor_data <- full_join(epi_tumor_data, progeny_scores_data, by = "cell_id")

progeny_names <- rownames(epi_pca@assays$progeny)

epi_tumor_data$bin <- cut_number(epi_tumor_data$PC_1, n = 10, labels = c(1:10))

ggplot(epi_tumor_data, mapping = aes(x = bin)) +
  geom_bar()

# patients along PC1

ggplot(epi_tumor_data, mapping = aes(x = bin, fill = factor(patient_id, levels = c("p032", "p018", "p019", "p024", "p031", "p030", "p033", "p023", "p027", "p034")))) +
  geom_bar() +
  theme(legend.title = element_blank()) +
  scale_fill_manual(values = use_colors)
ggsave2("SuppFig3G.pdf", path = "../results", width = 30, height = 30, units = "cm")

# histological subtypes along PC1

ggplot(epi_tumor_data, mapping = aes(x = bin, fill = factor(histo_subtype, levels = c("lepidic", "acinar", "mucinuous (papillary)", "(micro)papillary", "solid", "sarcomatoid")))) +
  geom_bar() +
  theme(legend.title = element_blank()) +
  scale_fill_manual(values = use_colors)
ggsave2("Fig2F.pdf", path = "../results", width = 30, height = 30, units = "cm")

# cell cycle phase along PC1

ggplot(epi_tumor_data, mapping = aes(x = bin, fill = Phase)) +
  geom_bar() +
  scale_fill_manual(values = use_colors)
ggsave2("SuppFig3H.pdf", path = "../results", width = 30, height = 30, units = "cm")

# normal cell type signatures along PC1

epi_tumor_data %>%
  select(c(bin, VB_Basal.11, VB_Basal.22, VB_Ciliated.13, VB_Ciliated.24, VB_Club5, VB_Goblet.26, VB_Goblet.17, VB_Ionocytes8, VB_Type.2.alveolar9, VB_Type.1.alveolar10)) %>%
  pivot_longer(cols = c(VB_Basal.11, VB_Basal.22, VB_Ciliated.13, VB_Ciliated.24, VB_Club5, VB_Goblet.26, VB_Goblet.17, VB_Ionocytes8, VB_Type.2.alveolar9, VB_Type.1.alveolar10), names_to = "cell_type") %>%
  group_by(bin, cell_type) %>%
  summarise(mean = mean(value), sd = sd(value)) %>%
  ggplot(aes(x = as.numeric(bin), y = mean, color = cell_type)) +
  geom_line(size = 1) +
  scale_color_brewer(type = "qual", palette = "Paired") +
  scale_x_continuous(breaks = c(1:20))
ggsave2("Fig2G.pdf", path = "../results", width = 30, height = 30, units = "cm")

# progeny pathway scores along PC1

getcolors <- colorRampPalette(brewer.pal(14, "Dark2"))

epi_tumor_data %>%
  select(bin, progeny_names) %>%
  pivot_longer(cols = progeny_names, names_to = "progeny") %>%
  group_by(bin, progeny) %>%
  summarise(mean = mean(value), sd = sd(value)) %>%
  ggplot(aes(x = as.numeric(bin), y = mean, color = progeny)) +
  geom_line(size = 1) +
  scale_color_manual(values = getcolors(14)) +
  scale_x_continuous(breaks = c(1:20))
ggsave2("Fig2H.pdf", path = "../results", width = 30, height = 30, units = "cm")

# tumor cell signatures along PC1

epi_tumor_data %>%
  select(bin, tumor_signature1, anti_tumor_signature1) %>%
  pivot_longer(cols = c(tumor_signature1, anti_tumor_signature1), names_to = "genes") %>%
  group_by(bin, genes) %>%
  summarise(mean = mean(value), sd = sd(value)) %>%
  ggplot(aes(x = as.numeric(bin), y = mean, color = genes)) +
  geom_line(size = 1) +
  geom_errorbar(aes(ymin=mean-sd, ymax=mean+sd), width = 0.3) +
  scale_x_continuous(breaks = c(1:20))
ggsave2("SuppFig3I.pdf", path = "../results", width = 30, height = 30, units = "cm")

#######################################stromal analyses

### load libraries
library(Seurat)
library(dplyr)
library(reticulate)
library(sctransform)
library(cowplot)
library(ggplot2)
library(viridis)
library(tidyr)
library(magrittr)
library(reshape2)
library(readxl)
library(readr)
library(stringr)
library(gplots)
library(grid)
library(rlang)
library(tibble)

theme_set(theme_cowplot())

#color scheme
use_colors <- c(
  Tumor = "brown2",
  Normal = "deepskyblue2",
  G1 = "#46ACC8",
  G2M = "#E58601",
  S = "#B40F20",
  Epithelial = "seagreen",
  Immune = "darkgoldenrod2",
  Stromal = "steelblue",
  p018 = "#E2D200",
  p019 = "#46ACC8",
  p023 = "#E58601",
  p024 = "#B40F20",
  p027 = "#0B775E",
  p028 = "#E1BD6D",
  p029 = "#35274A",
  p030 = "#F2300F",
  p031 = "#7294D4",
  p032 = "#5B1A18",
  p033 = "#9C964A",
  p034 = "#FD6467",
  Endothelial1 = "#FED976",
  Endothelial2 = "#FEB24C",
  Endothelial3 = "#fd8d3C",
  Endothelial4 = "#FC4E2A",
  Endothelial5 = "#E31A1C",
  Endothelial6 = "#BD0026",
  Endothelial7 = "#800026",
  Lymphaticendothelial = "salmon",
  Fibroblast1 = "#2166AC",
  Fibroblast2 = "#4393C3",
  Myofibroblast1 = "#5AAE61",
  Myofibroblast2 = "#1B7837",
  Smoothmuscle1 = "#9970AB",
  Smoothmuscle2 = "#762A83",
  Mesothelial = "#40004B")

#str_anno <- readRDS("seurat_objects/str_anno.RDS")

str_anno@meta.data$cell_type_str <- factor(str_anno@meta.data$cell_type_str, levels = c("Endothelial1",
                                                                                        "Endothelial2",
                                                                                        "Endothelial3",
                                                                                        "Endothelial4",
                                                                                        "Endothelial5",
                                                                                        "Endothelial6",
                                                                                        "Endothelial7",
                                                                                        "Lymphaticendothelial",
                                                                                        "Fibroblast1",
                                                                                        "Fibroblast2",
                                                                                        "Myofibroblast1",
                                                                                        "Myofibroblast2",
                                                                                        "Smoothmuscle1",
                                                                                        "Smoothmuscle2",
                                                                                        "Mesothelial"))

DimPlot(str_anno, group.by = "tissue_type", cols = use_colors)
#ggsave2("DimPlot_str_Normal_Tumor.pdf", path = "output/fig3", width = 15, height = 15, units = "cm")

DimPlot(str_anno, group.by = "patient_id", cols = use_colors, pt.size = 0.5)
ggsave2("SuppFig1C_str_patients.pdf", path = "../results", width = 15, height = 15, units = "cm")

DimPlot(str_anno, group.by = "cell_type_str", label = F, split.by = "tissue_type", cols = use_colors, pt.size = 0.5)
ggsave2("Fig3A_umap.pdf", path = "../results", width = 30, height = 15, units = "cm")

DotPlot(str_anno, features = c("WT1", "UPK3B", "MYH11", "PDGFRB", "ACTA2", "MYLK", "LUM", "PDGFRA", "CCL21", "PROX1", "PECAM1", "VWF"), group.by = "cell_type_str") + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_flip() + 
  scale_color_viridis()
ggsave2("Fig3B.pdf", path = "../results", width = 16, height = 12, units = "cm")

###subsetting

str_endo <- subset(str_anno, subset = cell_type_str %in% c("Endothelial1",
                                                           "Endothelial2",
                                                           "Endothelial3",
                                                           "Endothelial4",
                                                           "Endothelial5",
                                                           "Endothelial6",
                                                           "Endothelial7",
                                                           "Lymphaticendothelial"))
str_endo <- ScaleData(str_endo)

str_fibro <- subset(str_anno, subset = cell_type_str %in% c("Fibroblast1",
                                                            "Fibroblast2",
                                                            "Myofibroblast1",
                                                            "Myofibroblast2",
                                                            "Smoothmuscle1",
                                                            "Smoothmuscle2",
                                                            "Mesothelial"))

str_fibro <- ScaleData(str_fibro)

endo_counts <- FetchData(str_endo, vars = c("tissue_type", "cell_type_str", "sample_id", "patient_id")) %>%  
  mutate(tissue_type = factor(tissue_type, levels = c("Tumor", "Normal")))

endo_counts_tbl <- endo_counts %>%
  dplyr::count(cell_type_str, patient_id, tissue_type)
write_csv(endo_counts_tbl, path = "../results/SuppTable1.csv")

fibro_counts <- FetchData(str_fibro, vars = c("tissue_type", "cell_type_str", "sample_id", "patient_id")) %>%  
  mutate(tissue_type = factor(tissue_type, levels = c("Tumor", "Normal"))) 

fibro_counts_tbl <- fibro_counts %>%
  dplyr::count(cell_type_str, patient_id, tissue_type)
write_csv(fibro_counts_tbl, path = "../results/SuppTable2.csv")

ggplot(data = endo_counts, aes(x = tissue_type, fill = cell_type_str)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = use_colors) +
  coord_flip() +
  scale_y_reverse()
ggsave2("Fig3A_barplot_endothelial.pdf", path = "../results", width = 20, height = 5, units = "cm")

ggplot(data = fibro_counts, aes(x = tissue_type, fill = cell_type_str)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = use_colors) +
  coord_flip() +
  scale_y_reverse()
ggsave2("Fig3A_barplot_fibroblastic.pdf", path = "../results", width = 20, height = 5, units = "cm")

endo_counts %>%
  filter(tissue_type == "Tumor") %>%
  ggplot(aes(x = sample_id, fill = cell_type_str)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = use_colors) +
  coord_flip() +
  scale_y_reverse()
ggsave2("Fig3A_barplot_endothelial_per_patient.pdf", path = "../results", width = 30, height = 30, units = "cm")

fibro_counts %>%
  filter(tissue_type == "Tumor") %>%
  ggplot(aes(x = sample_id, fill = cell_type_str)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = use_colors) +
  coord_flip() +
  scale_y_reverse()
ggsave2("Fig3A_barplot_fibroblastic_per_patient.pdf", path = "../results", width = 30, height = 30, units = "cm")

endo_counts %>%
  filter(tissue_type == "Normal") %>%
  ggplot(aes(x = sample_id, fill = cell_type_str)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = use_colors) +
  coord_flip() +
  scale_y_reverse()
ggsave2("SuppFig6A_endothelial.pdf", path = "../results", width = 30, height = 30, units = "cm")

fibro_counts %>%
  filter(tissue_type == "Normal") %>%
  ggplot(aes(x = sample_id, fill = cell_type_str)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = use_colors) +
  coord_flip() +
  scale_y_reverse()
ggsave2("SuppFig6A_fibroblastic.pdf", path = "../results", width = 30, height = 30, units = "cm")

#heatmap wrapper function
###code from https://github.com/satijalab/seurat/issues/2201

DoMultiBarHeatmap <- function (object, 
                               features = NULL, 
                               cells = NULL, 
                               group.by = "ident", 
                               additional.group.by = NULL, 
                               additional.group.sort.by = NULL, 
                               cols.use = NULL,
                               group.bar = TRUE, 
                               disp.min = -2.5, 
                               disp.max = NULL, 
                               slot = "scale.data", 
                               assay = NULL, 
                               label = TRUE, 
                               size = 5.5, 
                               hjust = 0, 
                               angle = 45, 
                               raster = TRUE, 
                               draw.lines = TRUE, 
                               lines.width = NULL, 
                               group.bar.height = 0.02, 
                               combine = TRUE) 
{
  cells <- cells %||% colnames(x = object)
  if (is.numeric(x = cells)) {
    cells <- colnames(x = object)[cells]
  }
  assay <- assay %||% DefaultAssay(object = object)
  DefaultAssay(object = object) <- assay
  features <- features %||% VariableFeatures(object = object)
  ## Why reverse???
  features <- rev(x = unique(x = features))
  disp.max <- disp.max %||% ifelse(test = slot == "scale.data", 
                                   yes = 2.5, no = 6)
  possible.features <- rownames(x = GetAssayData(object = object, 
                                                 slot = slot))
  if (any(!features %in% possible.features)) {
    bad.features <- features[!features %in% possible.features]
    features <- features[features %in% possible.features]
    if (length(x = features) == 0) {
      stop("No requested features found in the ", slot, 
           " slot for the ", assay, " assay.")
    }
    warning("The following features were omitted as they were not found in the ", 
            slot, " slot for the ", assay, " assay: ", paste(bad.features, 
                                                             collapse = ", "))
  }

  if (!is.null(additional.group.sort.by)) {
    if (any(!additional.group.sort.by %in% additional.group.by)) {
      bad.sorts <- additional.group.sort.by[!additional.group.sort.by %in% additional.group.by]
      additional.group.sort.by <- additional.group.sort.by[additional.group.sort.by %in% additional.group.by]
      if (length(x = bad.sorts) > 0) {
        warning("The following additional sorts were omitted as they were not a subset of additional.group.by : ", 
                paste(bad.sorts, collapse = ", "))
      }
    }
  }

  data <- as.data.frame(x = as.matrix(x = t(x = GetAssayData(object = object, 
                                                             slot = slot)[features, cells, drop = FALSE])))

  object <- suppressMessages(expr = StashIdent(object = object, 
                                               save.name = "ident"))
  group.by <- group.by %||% "ident"
  groups.use <- object[[c(group.by, additional.group.by[!additional.group.by %in% group.by])]][cells, , drop = FALSE]
  plots <- list()
  for (i in group.by) {
    data.group <- data
    if (!is_null(additional.group.by)) {
      additional.group.use <- additional.group.by[additional.group.by!=i]  
      if (!is_null(additional.group.sort.by)){
        additional.sort.use = additional.group.sort.by[additional.group.sort.by != i]  
      } else {
        additional.sort.use = NULL
      }
    } else {
      additional.group.use = NULL
      additional.sort.use = NULL
    }

    group.use <- groups.use[, c(i, additional.group.use), drop = FALSE]

    for(colname in colnames(group.use)){
      if (!is.factor(x = group.use[[colname]])) {
        group.use[[colname]] <- factor(x = group.use[[colname]])
      }  
    }

    if (draw.lines) {
      lines.width <- lines.width %||% ceiling(x = nrow(x = data.group) * 
                                                0.0025)
      placeholder.cells <- sapply(X = 1:(length(x = levels(x = group.use[[i]])) * 
                                           lines.width), FUN = function(x) {
                                             return(Seurat:::RandomName(length = 20))
                                           })
      placeholder.groups <- data.frame(rep(x = levels(x = group.use[[i]]), times = lines.width))
      group.levels <- list()
      group.levels[[i]] = levels(x = group.use[[i]])
      for (j in additional.group.use) {
        group.levels[[j]] <- levels(x = group.use[[j]])
        placeholder.groups[[j]] = NA
      }

      colnames(placeholder.groups) <- colnames(group.use)
      rownames(placeholder.groups) <- placeholder.cells

      group.use <- sapply(group.use, as.vector)
      rownames(x = group.use) <- cells

      group.use <- rbind(group.use, placeholder.groups)

      for (j in names(group.levels)) {
        group.use[[j]] <- factor(x = group.use[[j]], levels = group.levels[[j]])
      }

      na.data.group <- matrix(data = NA, nrow = length(x = placeholder.cells), 
                              ncol = ncol(x = data.group), dimnames = list(placeholder.cells, 
                                                                           colnames(x = data.group)))
      data.group <- rbind(data.group, na.data.group)
    }

    order_expr <- paste0('order(', paste(c(i, additional.sort.use), collapse=','), ')')
    group.use = with(group.use, group.use[eval(parse(text=order_expr)), , drop=F])

    plot <- Seurat:::SingleRasterMap(data = data.group, raster = raster, 
                                     disp.min = disp.min, disp.max = disp.max, feature.order = features, 
                                     cell.order = rownames(x = group.use), group.by = group.use[[i]])

    if (group.bar) {
      pbuild <- ggplot_build(plot = plot)
      group.use2 <- group.use
      cols <- list()
      na.group <- Seurat:::RandomName(length = 20)
      for (colname in rev(x = colnames(group.use2))) {
        if (colname == i) {
          colid = paste0('Identity (', colname, ')')
        } else {
          colid = colname
        }

        # Default
        cols[[colname]] <- c(scales::hue_pal()(length(x = levels(x = group.use[[colname]]))))  

        #Overwrite if better value is provided
        if (!is_null(cols.use[[colname]])) {
          req_length = length(x = levels(group.use))
          if (length(cols.use[[colname]]) < req_length){
            warning("Cannot use provided colors for ", colname, " since there aren't enough colors.")
          } else {
            if (!is_null(names(cols.use[[colname]]))) {
              if (all(levels(group.use[[colname]]) %in% names(cols.use[[colname]]))) {
                cols[[colname]] <- as.vector(cols.use[[colname]][levels(group.use[[colname]])])
              } else {
                warning("Cannot use provided colors for ", colname, " since all levels (", paste(levels(group.use[[colname]]), collapse=","), ") are not represented.")
              }
            } else {
              cols[[colname]] <- as.vector(cols.use[[colname]])[c(1:length(x = levels(x = group.use[[colname]])))]
            }
          }
        }

        # Add white if there's lines
        if (draw.lines) {
          levels(x = group.use2[[colname]]) <- c(levels(x = group.use2[[colname]]), na.group)  
          group.use2[placeholder.cells, colname] <- na.group
          cols[[colname]] <- c(cols[[colname]], "#FFFFFF")
        }
        names(x = cols[[colname]]) <- levels(x = group.use2[[colname]])

        y.range <- diff(x = pbuild$layout$panel_params[[1]]$y.range)
        y.pos <- max(pbuild$layout$panel_params[[1]]$y.range) + y.range * 0.015
        y.max <- y.pos + group.bar.height * y.range
        pbuild$layout$panel_params[[1]]$y.range <- c(pbuild$layout$panel_params[[1]]$y.range[1], y.max)

        plot <- suppressMessages(plot + 
                                   annotation_raster(raster = t(x = cols[[colname]][group.use2[[colname]]]),  xmin = -Inf, xmax = Inf, ymin = y.pos, ymax = y.max) + 
                                   annotation_custom(grob = grid::textGrob(label = colid, hjust = 0, gp = gpar(cex = 0.75)), ymin = mean(c(y.pos, y.max)), ymax = mean(c(y.pos, y.max)), xmin = Inf, xmax = Inf) +
                                   coord_cartesian(ylim = c(0, y.max), clip = "off")) 

        if ((colname == i) && label) {
          x.max <- max(pbuild$layout$panel_params[[1]]$x.range)
          x.divs <- pbuild$layout$panel_params[[1]]$x.major %||% pbuild$layout$panel_params[[1]]$x$break_positions()
          group.use$x <- x.divs
          label.x.pos <- tapply(X = group.use$x, INDEX = group.use[[colname]],
                                FUN = median) * x.max
          label.x.pos <- data.frame(group = names(x = label.x.pos), 
                                    label.x.pos)
          plot <- plot + geom_text(stat = "identity", 
                                   data = label.x.pos, aes_string(label = "group", 
                                                                  x = "label.x.pos"), y = y.max + y.max * 
                                     0.03 * 0.5, angle = angle, hjust = hjust, 
                                   size = size)
          plot <- suppressMessages(plot + coord_cartesian(ylim = c(0, 
                                                                   y.max + y.max * 0.002 * max(nchar(x = levels(x = group.use[[colname]]))) * 
                                                                     size), clip = "off"))
        }
      }
    }
    plot <- plot + theme(line = element_blank())
    plots[[i]] <- plot
  }
  if (combine) {
    plots <- CombinePlots(plots = plots)
  }
  return(plots)
}

###DEGs endothelial

Idents(str_endo) <- str_endo@meta.data$cell_type_str

endo_markers <- FindAllMarkers(str_endo, only.pos = T, min.pct = 0.25, min.diff.pct = 0.25)

top_endo_markers <- endo_markers %>% group_by(cluster) %>% top_n(10, wt = avg_logFC)

DoMultiBarHeatmap(str_endo, features = top_endo_markers$gene, group.by = "cell_type_str", additional.group.by = "tissue_type",additional.group.sort.by = "tissue_type", cols.use = list(tissue_type = use_colors), draw.lines = F) +
  scale_fill_viridis()
ggsave2("SuppFig6B.png", path = "../results", width = 30, height = 40, units = "cm")
#ggsave2("HeatMap_Endo.pdf", path = "output/fig3", width = 30, height = 40, units = "cm")
#ggsave2("HeatMap_Endo.emf", path = "output/fig3", width = 30, height = 40, units = "cm")

###DEGs fibroblastic

Idents(str_fibro) <- str_fibro@meta.data$cell_type_str

fibro_markers <- FindAllMarkers(str_fibro, only.pos = T, min.pct = 0.25, min.diff.pct = 0.25)

top_fibro_markers <- fibro_markers %>% group_by(cluster) %>% top_n(10, wt = avg_logFC)

DoMultiBarHeatmap(str_fibro, features = top_fibro_markers$gene, group.by = "cell_type_str", additional.group.by = "tissue_type",additional.group.sort.by = "tissue_type", cols.use = list(tissue_type = use_colors), draw.lines = F) +
  scale_fill_viridis()
ggsave2("Fig6C.png", path = "../results", width = 30, height = 40, units = "cm")
#ggsave2("HeatMap_Fibro.pdf", path = "output/fig3", width = 30, height = 40, units = "cm")
#ggsave2("HeatMap_Fibro.emf", path = "output/fig3", width = 30, height = 40, units = "cm")

###progeny scores

str_fibro2 <- subset(str_anno, subset = cell_type_str %in% c("Fibroblast1",
                                                             "Fibroblast2",
                                                             "Myofibroblast1",
                                                             "Myofibroblast2",
                                                             "Smoothmuscle1",
                                                             "Smoothmuscle2"))

progeny_scores <- as.data.frame(t(GetAssayData(str_fibro2, assay = "progeny", slot = "scale.data")))
progeny_scores$cell_id <- rownames(progeny_scores)
progeny_scores <- gather(progeny_scores, Pathway, Activity, -cell_id)

cells_clusters <- FetchData(str_anno, c("cell_type_str"))
cells_clusters$cell_id <- rownames(cells_clusters)

progeny_scores <- inner_join(progeny_scores, cells_clusters)

summarized_progeny_scores <- progeny_scores %>% 
  group_by(Pathway, cell_type_str) %>% 
  summarise(avg = mean(Activity), std = sd(Activity)) %>%
  pivot_wider(id_cols = Pathway, names_from = cell_type_str, values_from = avg) %>%
  column_to_rownames("Pathway") %>%
  as.matrix()

pdf("../results/Fig3D.pdf", width = 7, height = 10)
heatmap.2(summarized_progeny_scores, trace = "none", density.info = "none", col = bluered(100), margins = c(10,10))
dev.off()

#############################################immune analyses

### load libraries
library(Seurat)
library(dplyr)
library(reticulate)
library(sctransform)
library(cowplot)
library(ggplot2)
library(viridis)
library(tidyr)
library(magrittr)
library(reshape2)
library(readxl)
library(stringr)
library(cowplot)
library(scales)
library(readr)
library(progeny)
library(gplots)
library(tibble)
library(grid)
library(rlang)

theme_set(theme_cowplot())

use_colors <- c(
  Tumor = "brown2",
  Normal = "deepskyblue2",
  G1 = "#46ACC8",
  G2M = "#E58601",
  S = "#B40F20",
  Epithelial = "seagreen",
  Immune = "darkgoldenrod2",
  Stromal = "steelblue",
  p018 = "#E2D200",
  p019 = "#46ACC8",
  p023 = "#E58601",
  p024 = "#B40F20",
  p027 = "#0B775E",
  p028 = "#E1BD6D",
  p029 = "#35274A",
  p030 = "#F2300F",
  p031 = "#7294D4",
  p032 = "#5B1A18",
  p033 = "#9C964A",
  p034 = "#FD6467",
  Alveolar_Macrophages1 = "#6bAEd6",
  Alveolar_Macrophages2 = "#3182BD",
  Alveolar_Macrophages3 = "#08519C",
  CD14_Macrophages1= "#fff500",
  CD14_Macrophages2= "#FE9929",
  CD14_Macrophages3= "#EC7014",
  CD14_Macrophages4= "#CC4C02",
  CD14_Macrophages5= "#8C2D04",
  Macrophages_Proliferating= "#E31A1C",
  Monocytes= "#FA9FB5",
  Myeloid_Dendritic= "#DD3497",
  Plasmacytoid_Dendritic= "#7A0177",
  T_conv1= "#c2e699",
  T_conv2= "#78c679",
  T_reg= "#006837",
  T_CD8_1= "#bcbddc",
  T_CD8_2= "#9e9ac8",
  T_CD8_3= "#807dba",
  T_CD8_Proliferating= "#6a51a3",
  NK_cells= "#4a1486",
  B_cells= "#969696",
  Plasma= "#636363",
  Mast= "#252525")

#load data

#imm_anno <- readRDS("seurat_objects/imm_anno.RDS")

imm_anno@meta.data$cell_type_imm <- ordered(imm_anno@meta.data$cell_type_imm, levels = c("Alveolar_Macrophages1",
                                                                                         "Alveolar_Macrophages2",
                                                                                         "Alveolar_Macrophages3",
                                                                                         "CD14_Macrophages1",
                                                                                         "CD14_Macrophages2",
                                                                                         "CD14_Macrophages3",
                                                                                         "CD14_Macrophages4",
                                                                                         "CD14_Macrophages5",
                                                                                         "Macrophages_Proliferating",
                                                                                         "Monocytes",
                                                                                         "Myeloid_Dendritic",
                                                                                         "Plasmacytoid_Dendritic",
                                                                                         "Mast",
                                                                                         "T_conv1",
                                                                                         "T_conv2",
                                                                                         "T_reg",
                                                                                         "T_CD8_1",
                                                                                         "T_CD8_2",
                                                                                         "T_CD8_3",
                                                                                         "T_CD8_Proliferating",
                                                                                         "NK_cells",
                                                                                         "B_cells",
                                                                                         "Plasma"))

DimPlot(imm_anno, group.by = "tissue_type", cols = use_colors)
#ggsave2("DimPlot_imm_Normal_Tumor.pdf", path = "output/fig4", width = 15, height = 15, units = "cm")
#ggsave2("DimPlot_imm_Normal_Tumor.png", path = "output/fig4", width = 35, height = 15, units = "cm")

DimPlot(imm_anno, group.by = "patient_id", cols = use_colors, pt.size = 0.5)
#ggsave2("DimPlot_imm_patients.pdf", path = "output/fig4", width = 30, height = 15, units = "cm")
ggsave2("SuppFig1C_imm_patients.png", path = "../results", width = 15, height = 15, units = "cm")

DimPlot(imm_anno, group.by = "cell_type_imm", split.by = "tissue_type", cols = use_colors, pt.size = 0.5)
#ggsave2("DimPlot_imm_celltype.pdf", path = "output/fig4", width = 35, height = 15, units = "cm")
ggsave2("Fig4A_umap.png", path = "../results", width = 35, height = 15, units = "cm")

DotPlot(imm_anno, features = c("CD68", "LYZ", "FABP4", "MARCO", "LGMN", "CSF1R", "CD14", "S100A12", "FCN1", "CD1C", "FCER1A", "LILRA4", "IL3RA", "KIT", "GATA2", "CD3E", "CD4", "FOXP3", "IL2RA", "CD8A", "NKG7", "KLRD1", "MS4A1", "CD79A", "JCHAIN", "IGKC", "MKI67"), group.by = "cell_type_imm") + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_flip() + 
  scale_color_viridis()
ggsave2("Fig4B.pdf", path = "../results", width = 20, height = 20, units = "cm")

###subsetting

imm_lympho <- subset(imm_anno, subset = cell_type_imm %in% c("T_conv1",
                                                             "T_conv2",
                                                             "T_reg",
                                                             "T_CD8_1",
                                                             "T_CD8_2",
                                                             "T_CD8_3",
                                                             "T_CD8_Proliferating",
                                                             "NK_cells",
                                                             "B_cells",
                                                             "Plasma"))

imm_lympho <- ScaleData(imm_lympho)

imm_myelo <- subset(imm_anno, subset = cell_type_imm %in% c("Alveolar_Macrophages1",
                                                            "Alveolar_Macrophages2",
                                                            "Alveolar_Macrophages3",
                                                            "CD14_Macrophages1",
                                                            "CD14_Macrophages2",
                                                            "CD14_Macrophages3",
                                                            "CD14_Macrophages4",
                                                            "CD14_Macrophages5",
                                                            "Macrophages_Proliferating",
                                                            "Monocytes",
                                                            "Myeloid_Dendritic",
                                                            "Plasmacytoid_Dendritic",
                                                            "Mast"))

imm_myelo <- ScaleData(imm_myelo)

lympho_counts <- FetchData(imm_lympho, vars = c("tissue_type", "cell_type_imm", "sample_id", "patient_id")) %>%  
  mutate(tissue_type = factor(tissue_type, levels = c("Tumor", "Normal")))

lympho_counts_tbl <- lympho_counts %>%
  dplyr::count(cell_type_imm, patient_id, tissue_type)
write_csv(lympho_counts_tbl, path = "../results/SuppTable4.csv")

myelo_counts <- FetchData(imm_myelo, vars = c("tissue_type", "cell_type_imm", "sample_id", "patient_id")) %>%  
  mutate(tissue_type = factor(tissue_type, levels = c("Tumor", "Normal"))) 

myelo_counts_tbl <- myelo_counts %>%
  dplyr::count(cell_type_imm, patient_id, tissue_type)
write_csv(myelo_counts_tbl, path = "../results/SuppTable3.csv")

ggplot(data = lympho_counts, aes(x = tissue_type, fill = cell_type_imm)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = use_colors) +
  coord_flip() +
  scale_y_reverse()
ggsave2("Fig4A_barplot_lymphoid.pdf", path = "../results", width = 20, height = 5, units = "cm")

ggplot(data = myelo_counts, aes(x = tissue_type, fill = cell_type_imm)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = use_colors) +
  coord_flip() +
  scale_y_reverse()
ggsave2("Fig4A_barplot_myeloid.pdf", path = "../results", width = 20, height = 5, units = "cm")

lympho_counts %>%
  filter(tissue_type == "Tumor") %>%
  ggplot(aes(x = sample_id, fill = cell_type_imm)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = use_colors) +
  coord_flip() +
  scale_y_reverse()
ggsave2("Fig4A_barplot_lymphoid_per_patient.pdf", path = "../results", width = 30, height = 30, units = "cm")

myelo_counts %>%
  filter(tissue_type == "Tumor") %>%
  ggplot(aes(x = sample_id, fill = cell_type_imm)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = use_colors) +
  coord_flip() +
  scale_y_reverse()
ggsave2("Fig4A_barplot_myeloid_per_patient.pdf", path = "../results", width = 30, height = 30, units = "cm")

lympho_counts %>%
  filter(tissue_type == "Normal") %>%
  ggplot(aes(x = sample_id, fill = cell_type_imm)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = use_colors) +
  coord_flip() +
  scale_y_reverse()
ggsave2("SuppFig7A_lymphoid.pdf", path = "../results", width = 30, height = 30, units = "cm")

myelo_counts %>%
  filter(tissue_type == "Normal") %>%
  ggplot(aes(x = sample_id, fill = cell_type_imm)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = use_colors) +
  coord_flip() +
  scale_y_reverse()
ggsave2("SuppFig7A_myeloid.pdf", path = "../results", width = 30, height = 30, units = "cm")

###DEGs macrophages

imm_macro <- subset(imm_anno, subset = cell_type_imm %in% c("Alveolar_Macrophages1",
                                                            "Alveolar_Macrophages2",
                                                            "Alveolar_Macrophages3",
                                                            "CD14_Macrophages1",
                                                            "CD14_Macrophages2",
                                                            "CD14_Macrophages3",
                                                            "CD14_Macrophages4",
                                                            "CD14_Macrophages5",
                                                            "Macrophages_Proliferating"))

imm_macro <- ScaleData(imm_macro)

Idents(imm_macro) <- imm_macro@meta.data$cell_type_imm

macro_markers <- FindAllMarkers(imm_macro, only.pos = T, min.pct = 0.25, min.diff.pct = 0.25)

top_macro_markers <- macro_markers %>% group_by(cluster) %>% top_n(10, wt = avg_logFC)

DoMultiBarHeatmap(imm_macro, features = top_macro_markers$gene, group.by = "cell_type_imm", additional.group.by = "tissue_type",additional.group.sort.by = "tissue_type", cols.use = list(tissue_type = use_colors), draw.lines = F) +
  scale_fill_viridis()
#ggsave2("HeatMap_Macro.pdf", path = "output/fig4", width = 30, height = 40, units = "cm")
ggsave2("SuppFig7B.png", path = "../results", width = 30, height = 40, units = "cm")
#ggsave2("HeatMap_Macro.emf", path = "output/fig4", width = 30, height = 40, units = "cm")

###DEGs T cells

imm_T <- subset(imm_anno, subset = cell_type_imm %in% c("T_conv1",
                                                        "T_conv2",
                                                        "T_reg",
                                                        "T_CD8_1",
                                                        "T_CD8_2",
                                                        "T_CD8_3",
                                                        "T_CD8_Proliferating"))

imm_T <- ScaleData(imm_T)

Idents(imm_T) <- imm_T@meta.data$cell_type_imm

markers_T <- FindAllMarkers(imm_T, only.pos = T, min.pct = 0.25, min.diff.pct = 0.25)

top_markers_T <- markers_T %>% group_by(cluster) %>% top_n(10, wt = avg_logFC)

DoMultiBarHeatmap(imm_T, features = top_markers_T$gene, group.by = "cell_type_imm", additional.group.by = "tissue_type",additional.group.sort.by = "tissue_type", cols.use = list(tissue_type = use_colors), draw.lines = F) +
  scale_fill_viridis()
#ggsave2("HeatMap_T.pdf", path = "output/fig4", width = 30, height = 35, units = "cm")
ggsave2("SuppFig7C.png", path = "../results", width = 30, height = 35, units = "cm")
#ggsave2("HeatMap_T.emf", path = "output/fig4", width = 30, height = 35, units = "cm")

###selected hallmark signatures and M1vsM2 signatures

m1m2_pws <- read_lines("../data/data/CLASSICAL_M1_VS_ALTERNATIVE_M2_MACROPHAGE_UP.gmt") %>%
  lapply(str_split, "\\t") %>% 
  unlist(recursive = F) %>% 
  lapply(function(x) setNames(list(x[-c(1:2)]), x[1])) %>% 
  unlist(recursive = F)

m1m2_pws <- append(m1m2_pws, read_lines("../data/data/CLASSICAL_M1_VS_ALTERNATIVE_M2_MACROPHAGE_DN.gmt") %>%
                     lapply(str_split, "\\t") %>% 
                     unlist(recursive = F) %>% 
                     lapply(function(x) setNames(list(x[-c(1:2)]), x[1])) %>% 
                     unlist(recursive = F))

imm_anno <- AddModuleScore(object = imm_anno, features = m1m2_pws, name = c("m1up", "m1dn"), nbin = 12)

VlnPlot(imm_anno, features = c("HALLMARK_INFLAMMATORY_RESPONSE31",
                               "HALLMARK_ALLOGRAFT_REJECTION46",
                               "HALLMARK_INTERFERON_GAMMA_RESPONSE19",
                               "HALLMARK_TNFA_SIGNALING_VIA_NFKB1",
                               "m1up1", 
                               "m1dn2"), 
        group.by = "cell_type_imm", pt.size = 0, ncol = 3, idents = c("Alveolar_Macrophages1",
                                                                      "Alveolar_Macrophages2",
                                                                      "Alveolar_Macrophages3",
                                                                      "CD14_Macrophages1",
                                                                      "CD14_Macrophages2",
                                                                      "CD14_Macrophages3",
                                                                      "CD14_Macrophages4",
                                                                      "CD14_Macrophages5",
                                                                      "Macrophages_Proliferating"), cols = use_colors)
ggsave2("Fig4C.pdf", path = "../results", width = 30, height = 20, units = "cm")

###T cell signatures

T_exhausted <- read_excel("../data/data/CD8_T_cells_exhausted.xlsx", skip = 1)
cytotoxicity <- c("PRF1", "IFNG", "GNLY", "NKG7", "GZMB", "GZMA", "GZMH", "KLRK1", "KLRB1", "KLRD1", "CTSW", "CST7")

T_cell_markers <- list(T_exhausted$GeneSymbol, cytotoxicity)

imm_T <- AddModuleScore(imm_T, features = T_cell_markers, name = c("exhaustion", "cytotoxicity"), nbin = 12)

VlnPlot(imm_T, features = c("cytotoxicity2", "exhaustion1"), pt.size = 0, group.by = "cell_type_imm", cols = use_colors, idents = c("T_CD8_1", "T_CD8_2", "T_CD8_3", "T_CD8_Proliferating"), ncol = 1)
ggsave2("Fig4F.pdf", path = "../results", width = 10, height = 20, units = "cm")

#############################################correlation cell types

### load libraries
library(ggplot2)
library(Seurat)
library(dplyr)
library(tidyr)
library(cowplot)
library(gplots)
library(RColorBrewer)
library(plotrix)
library(corrplot)
library(pcaMethods)
library(readr)

theme_set(theme_cowplot())

#color scheme
use_colors <- c(
  Tumor = "brown2",
  Normal = "deepskyblue2",
  G1 = "#46ACC8",
  G2M = "#E58601",
  S = "#B40F20",
  Epithelial = "seagreen",
  Immune = "darkgoldenrod2",
  Stromal = "steelblue",
  p018 = "#E2D200",
  p019 = "#46ACC8",
  p023 = "#E58601",
  p024 = "#B40F20",
  p027 = "#0B775E",
  p028 = "#E1BD6D",
  p029 = "#35274A",
  p030 = "#F2300F",
  p031 = "#7294D4",
  p032 = "#5B1A18",
  p033 = "#9C964A",
  p034 = "#FD6467",
  lepidic = "#0B775E",
  acinar = "#74A089",
  mucinuous (papillary) = "#E2D200",
  (micro)papillary = "#CEAB07",
  solid = "#B40F20",
  sarcomatoid = "#5B1A18")

###load data and subsetting

#epi_anno <- readRDS("seurat_objects/epi_anno.RDS")

epi_tumor <- SubsetData(SubsetData(epi_anno, subset.name = "cluster_type", accept.value = "Tumor"), subset.name = "tissue_type", accept.value = "Tumor")
epi_tumor <- ScaleData(epi_tumor)

epi_pca <- epi_tumor
epi_pca <- RunPCA(epi_pca)

#imm_anno <- readRDS("seurat_objects/imm_anno.RDS")

imm_anno@meta.data$cell_type_imm <- ordered(imm_anno@meta.data$cell_type_imm, levels = c("Alveolar_Macrophages1",
                                                                                         "Alveolar_Macrophages2",
                                                                                         "Alveolar_Macrophages3",
                                                                                         "CD14_Macrophages1",
                                                                                         "CD14_Macrophages2",
                                                                                         "CD14_Macrophages3",
                                                                                         "CD14_Macrophages4",
                                                                                         "CD14_Macrophages5",
                                                                                         "Macrophages_Proliferating",
                                                                                         "Monocytes",
                                                                                         "Myeloid_Dendritic",
                                                                                         "Plasmacytoid_Dendritic",
                                                                                         "Mast",
                                                                                         "T_conv1",
                                                                                         "T_conv2",
                                                                                         "T_reg",
                                                                                         "T_CD8_1",
                                                                                         "T_CD8_2",
                                                                                         "T_CD8_3",
                                                                                         "T_CD8_Proliferating",
                                                                                         "NK_cells",
                                                                                         "B_cells",
                                                                                         "Plasma"))

imm_lympho <- subset(imm_anno, subset = cell_type_imm %in% c("T_conv1",
                                                             "T_conv2",
                                                             "T_reg",
                                                             "T_CD8_1",
                                                             "T_CD8_2",
                                                             "T_CD8_3",
                                                             "T_CD8_Proliferating",
                                                             "NK_cells",
                                                             "B_cells",
                                                             "Plasma"))

imm_myelo <- subset(imm_anno, subset = cell_type_imm %in% c("Alveolar_Macrophages1",
                                                            "Alveolar_Macrophages2",
                                                            "Alveolar_Macrophages3",
                                                            "CD14_Macrophages1",
                                                            "CD14_Macrophages2",
                                                            "CD14_Macrophages3",
                                                            "CD14_Macrophages4",
                                                            "CD14_Macrophages5",
                                                            "Macrophages_Proliferating",
                                                            "Monocytes",
                                                            "Myeloid_Dendritic",
                                                            "Plasmacytoid_Dendritic",
                                                            "Mast"))

lympho_counts <- FetchData(imm_lympho, vars = c("tissue_type", "cell_type_imm", "sample_id", "patient_id")) %>%  
  mutate(tissue_type = factor(tissue_type, levels = c("Tumor", "Normal")))

myelo_counts <- FetchData(imm_myelo, vars = c("tissue_type", "cell_type_imm", "sample_id", "patient_id")) %>%  
  mutate(tissue_type = factor(tissue_type, levels = c("Tumor", "Normal"))) 

#str_anno <- readRDS("seurat_objects/str_anno.RDS")

str_anno@meta.data$cell_type_str <- factor(str_anno@meta.data$cell_type_str, levels = c("Endothelial1",
                                                                                        "Endothelial2",
                                                                                        "Endothelial3",
                                                                                        "Endothelial4",
                                                                                        "Endothelial5",
                                                                                        "Endothelial6",
                                                                                        "Endothelial7",
                                                                                        "Lymphaticendothelial",
                                                                                        "Fibroblast1",
                                                                                        "Fibroblast2",
                                                                                        "Myofibroblast1",
                                                                                        "Myofibroblast2",
                                                                                        "Smoothmuscle1",
                                                                                        "Smoothmuscle2",
                                                                                        "Mesothelial"))

str_endo <- subset(str_anno, subset = cell_type_str %in% c("Endothelial1",
                                                           "Endothelial2",
                                                           "Endothelial3",
                                                           "Endothelial4",
                                                           "Endothelial5",
                                                           "Endothelial6",
                                                           "Endothelial7",
                                                           "Lymphaticendothelial"))

str_fibro <- subset(str_anno, subset = cell_type_str %in% c("Fibroblast1",
                                                            "Fibroblast2",
                                                            "Myofibroblast1",
                                                            "Myofibroblast2",
                                                            "Smoothmuscle1",
                                                            "Smoothmuscle2",
                                                            "Mesothelial"))

endo_counts <- FetchData(str_endo, vars = c("tissue_type", "cell_type_str", "sample_id", "patient_id")) %>%  
  mutate(tissue_type = factor(tissue_type, levels = c("Tumor", "Normal")))

fibro_counts <- FetchData(str_fibro, vars = c("tissue_type", "cell_type_str", "sample_id", "patient_id")) %>%  
  mutate(tissue_type = factor(tissue_type, levels = c("Tumor", "Normal"))) 

###count immune and stromal cells

myelo_counts_rel <- myelo_counts %>%
  filter(tissue_type == "Tumor") %>%
  dplyr::count(cell_type_imm, patient_id) %>%
  group_by(patient_id) %>% 
  mutate(n_rel = n/sum(n))

myelo_counts_rel <- myelo_counts_rel %>%
  pivot_wider(id_cols = patient_id, names_from = cell_type_imm, values_from = n_rel)

lympho_counts_rel <- lympho_counts %>%
  filter(tissue_type == "Tumor") %>%
  dplyr::count(cell_type_imm, patient_id) %>%
  group_by(patient_id) %>% 
  mutate(n_rel = n/sum(n)) 

lympho_counts_rel <- lympho_counts_rel %>%
  pivot_wider(id_cols = patient_id, names_from = cell_type_imm, values_from = n_rel)

fibro_counts_rel <- fibro_counts %>%
  filter(tissue_type == "Tumor") %>%
  dplyr::count(cell_type_str, patient_id) %>%
  group_by(patient_id) %>% 
  mutate(n_rel = n/sum(n))

fibro_counts_rel <- fibro_counts_rel %>%
  pivot_wider(id_cols = patient_id, names_from = cell_type_str, values_from = n_rel)

endo_counts_rel <- endo_counts %>%
  filter(tissue_type == "Tumor") %>%
  dplyr::count(cell_type_str, patient_id) %>%
  group_by(patient_id) %>% 
  mutate(n_rel = n/sum(n))

endo_counts_rel <- endo_counts_rel %>%
  pivot_wider(id_cols = patient_id, names_from = cell_type_str, values_from = n_rel)

cell_counts_rel <- full_join(myelo_counts_rel, lympho_counts_rel, by = "patient_id")
cell_counts_rel <- full_join(cell_counts_rel, endo_counts_rel, by = "patient_id")
cell_counts_rel <- full_join(cell_counts_rel, fibro_counts_rel, by = "patient_id")

cell_counts_rel[is.na(cell_counts_rel)] <- 0

###cell proportion statistics

test <- cell_counts_rel %>%
  mutate(pattern = ifelse(patient_id %in% c("p032", "p018", "p019", "p024", "p031", "p033"), "N3MC", "CP2E")) %>%
  mutate(T_CD8_exhausted = T_CD8_1 + T_CD8_2)

#for Kim et al. dataset
#test <- cell_counts_rel %>%
#  mutate(pattern = ifelse(patient_id %in% c("P0009", "P0018", "P0020", "P0028"), "CP2E", "N3MC")) %>%
#  mutate(T_CD8_exhausted = T_CD8_1 + T_CD8_2)

ggplot(test, aes(x = pattern, y = CD14_Macrophages1)) +
  geom_boxplot() +
  ggtitle(paste0("p = ", wilcox.test(formula = CD14_Macrophages1~pattern, data = test, alternative = "two.sided", paired = F)$p.value))
ggsave2("SuppFig8A_Group1vs2_CD14_Macrophages1.pdf", path = "../results", width = 10, height = 10, units = "cm")

ggplot(test, aes(x = pattern, y = CD14_Macrophages2)) +
  geom_boxplot() +
  ggtitle(paste0("p = ", wilcox.test(formula = CD14_Macrophages2~pattern, data = test, alternative = "two.sided", paired = F)$p.value))
ggsave2("SuppFig8A_Group1vs2_CD14_Macrophages2.pdf", path = "../results", width = 10, height = 10, units = "cm")

ggplot(test, aes(x = pattern, y = Myeloid_Dendritic)) +
  geom_boxplot() +
  ggtitle(paste0("p = ", wilcox.test(formula = Myeloid_Dendritic~pattern, data = test, alternative = "two.sided", paired = F)$p.value))
ggsave2("SuppFig8A_Group1vs2_Myeloid_Dendritic.pdf", path = "../results", width = 10, height = 10, units = "cm")

ggplot(test, aes(x = pattern, y = Plasmacytoid_Dendritic)) +
  geom_boxplot() +
  ggtitle(paste0("p = ", wilcox.test(formula = Plasmacytoid_Dendritic~pattern, data = test, alternative = "two.sided", paired = F)$p.value))
ggsave2("SuppFig8A_Group1vs2_Plasmacytoid_Dendritic.pdf", path = "../results", width = 10, height = 10, units = "cm")

ggplot(test, aes(x = pattern, y = Myofibroblast1)) +
  geom_boxplot() +
  ggtitle(paste0("p = ", wilcox.test(formula = Myofibroblast1~pattern, data = test, alternative = "two.sided", paired = F)$p.value))
ggsave2("SuppFig8A_Group1vs2_Myofibroblast1.pdf", path = "../results", width = 10, height = 10, units = "cm")

ggplot(test, aes(x = pattern, y = Myofibroblast2)) +
  geom_boxplot() +
  ggtitle(paste0("p = ", wilcox.test(formula = Myofibroblast2~pattern, data = test, alternative = "two.sided", paired = F)$p.value))
ggsave2("SuppFig8A_Group1vs2_Myofibroblast2.pdf", path = "../results", width = 10, height = 10, units = "cm")

ggplot(test, aes(x = pattern, y = T_conv1)) +
  geom_boxplot() +
  ggtitle(paste0("p = ", wilcox.test(formula = T_conv1~pattern, data = test, alternative = "two.sided", paired = F)$p.value))
ggsave2("SuppFig8A_Group1vs2_T_conv1.pdf", path = "../results", width = 10, height = 10, units = "cm")

ggplot(test, aes(x = pattern, y = NK_cells)) +
  geom_boxplot() +
  ggtitle(paste0("p = ", wilcox.test(formula = NK_cells~pattern, data = test, alternative = "two.sided", paired = F)$p.value))
ggsave2("SuppFig8A_Group1vs2_NK_cells.pdf", path = "../results", width = 10, height = 10, units = "cm")

ggplot(test, aes(x = pattern, y = T_CD8_exhausted)) +
  geom_boxplot() +
  ggtitle(paste0("p = ", wilcox.test(formula = T_CD8_exhausted~pattern, data = test, alternative = "two.sided", paired = F)$p.value))
ggsave2("SuppFig8A_Group1vs2_T_CD8_exhausted.pdf", path = "../results", width = 10, height = 10, units = "cm")

###select only cell types that occured in at least three patients

celltype_selected <- cell_counts_rel
celltype_selected[celltype_selected == 0] <- NA
celltype_selected <- cell_counts_rel[, which(colMeans(!is.na(celltype_selected)) >= 0.3)]

cell_counts_rel_selected <- cell_counts_rel[, colnames(cell_counts_rel) %in% colnames(celltype_selected)]

cell_counts_rel_mtrx <- as.matrix(cell_counts_rel_selected[,-1])
rownames(cell_counts_rel_mtrx) <- cell_counts_rel_selected$patient_id

###principal component analysis

cell_counts_pca <- as.data.frame(scores(pca(cell_counts_rel_mtrx, nPcs = 10)))
cell_counts_pca$patient_id <- rownames(cell_counts_pca)

histo_data <- unique(FetchData(epi_tumor, c("patient_id", "histo_subtype")))
cell_counts_pca <- full_join(cell_counts_pca, histo_data, by = "patient_id")

ggplot(cell_counts_pca, aes(x = PC1, y = PC2, color = histo_subtype, label = patient_id)) +
  geom_point(size = 2) +
  geom_text(aes(label = patient_id), color = "black", vjust = 1.5, size = 2) +
  scale_color_manual(values = use_colors) +
  scale_x_continuous(limits = c(-1, 1.2)) +
  scale_y_continuous(limits = c(-1.2, 0.8))
ggsave2("Fig5A.pdf", path = "../results", width = 15, height = 9, units = "cm")

###add mean tumor cell signature score

epi_pca <- AddModuleScore(epi_pca, features = list(c("SCGB3A1", "PIGR", "NAPSA", "C4BPA", "SCGB3A2", "HLA-DRA", "CD74", "ADGRF5", "C16orf89", "FOLR1", "SELENBP1", "HLA-DRB1", "ID4", "MGP", "AQP3", "CA2", "LHX9", "HLA-DPB1", "FMO5", "GKN2", "C5", "MUC1", "NPC2", "RNASE1", "PIK3C2G", "SFTA2", "SLC34A2", "HLA-DPA1", "FGFR3", "PGC")), name = "tumor_signature")
epi_pca <- AddModuleScore(epi_pca, features = list(c("DSG2", "CAMK2N1", "FAM3C", "KRT7", "IFI27", "SLC2A1", "MARCKS", "PLAU", "AHNAK2", "PERP", "S100A4", "KRT19", "COL6A1", "UACA", "COL17A1", "CDA", "TPM2", "S100A16", "KRT8", "PRSS23", "DST", "LAMC2", "S100P", "PRSS3", "LAMA3", "DSP", "ITGA3", "MDK", "FAM83A", "ITGB4")), name = "anti_tumor_signature")

epi_tumor_data <- FetchData(epi_pca, c(colnames(epi_pca@meta.data)))

epi_tumor_signature <- epi_tumor_data %>%
  group_by(patient_id) %>% 
  mutate(diff_mean = mean(tumor_signature1)) %>%
  mutate(undiff_mean = mean(anti_tumor_signature1)) %>%
  dplyr::select(patient_id, diff_mean, undiff_mean) %>%
  unique()

###heatmap of normalized immune and stromal cell counts, and mean tumor cell signatures

cell_counts_rel_heatmap <- full_join(cell_counts_rel_selected, cell_counts_pca, by = "patient_id")
cell_counts_rel_heatmap <- full_join(cell_counts_rel_heatmap, epi_tumor_signature, by = "patient_id")

cell_counts_rel_heatmap_ordered <- cell_counts_rel_heatmap[order(cell_counts_rel_heatmap$PC1),]

cell_counts_rel_heatmap1 <- cell_counts_rel_heatmap_ordered[,2:ncol(celltype_selected)]
cell_counts_rel_heatmap2 <- cell_counts_rel_heatmap_ordered[,c("diff_mean", "undiff_mean")]

rownames(cell_counts_rel_heatmap1) <- cell_counts_rel_heatmap_ordered$patient_id
rownames(cell_counts_rel_heatmap2) <- cell_counts_rel_heatmap_ordered$patient_id

cell_counts_rel_heatmap1 <- scale(cell_counts_rel_heatmap1)

pdf("../results/Fig5B_immune_and_stromal_cells.pdf", width = 7, height = 7)
heatmap.2(as.matrix(t(cell_counts_rel_heatmap1)), trace = "none", density.info = "none", scale = "none", symbreaks = F, col = hcl.colors(100, palette = "orrd", rev = T), Rowv = T, Colv = F, dendrogram = "row", margins = c(5,10))
dev.off()

pdf("../results/Fig5B_tumor_cell_signatures.pdf", width = 7, height = 5)
heatmap.2(as.matrix(t(cell_counts_rel_heatmap2)), trace = "none", density.info = "none", scale = "none", col = bluered(100), Rowv = F, Colv = F, dendrogram = "none")
dev.off()

###correlation plots

# matrix of the p-value of the correlation (from http://www.sthda.com/english/wiki/visualize-correlation-matrix-using-correlogram)
cor.mtest <- function(mat, ...) {
  mat <- as.matrix(mat)
  n <- ncol(mat)
  p.mat<- matrix(NA, n, n)
  diag(p.mat) <- 0
  for (i in 1:(n - 1)) {
    for (j in (i + 1):n) {
      tmp <- cor.test(mat[, i], mat[, j], method = "spearman", ...)
      p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
    }
  }
  colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
  p.mat
}

p.mat <- cor.mtest(cell_counts_rel_mtrx)

test <- cor(cell_counts_rel_mtrx, method = "spearman")

png("../results/SuppFig8A.png", width = 30, height = 30, units = "cm", res = 600)
corrplot(test, p.mat = p.mat, sig.level = 0.05, insig = "blank", type = "full", tl.col = "black")
dev.off()

#pdf("output/fig5/corrplot.pdf", width = 7, height = 7)
#corrplot(test, p.mat = p.mat, sig.level = 0.05, insig = "blank", type = "full", tl.col = "black")
#dev.off()

### correlation network plot

test_net <- test

for (i in 1:nrow(test_net)){
  for (j in 1:ncol(test_net)){
    if(p.mat[i,j] > 0.05){
      test_net[i,j] <- 0
    }
    if(test_net[i,j] == 1){
      test_net[i,j] <- 0
    }
  }
}

library(qgraph)

pdf("../results/Fig5C.pdf", width = 6, height = 6)
qgraph(test_net, layout = "spring", threshold = 0.7, labels = colnames(test_net), label.scale = F, label.cex = 0.7, theme = "colorblind", vsize = 3, color = "grey", borders = F)
dev.off()

#####scatter plots

cell_counts_rel <- mutate(cell_counts_rel, T_CD8_exhausted = T_CD8_1+T_CD8_2)

###plots stromal

ggplot(cell_counts_rel, aes(x = Myofibroblast1, y = Myofibroblast2)) +
  geom_point(aes(color = patient_id), size = 4) +
  scale_x_continuous(limits = c(0,0.8)) +
  scale_y_continuous(limits = c(0,0.8)) +
  scale_color_manual(values = use_colors) +
  geom_smooth(method = "lm", se = FALSE, size = 1, color = "black") +
  ggtitle(paste0("p = ",cor.test(cell_counts_rel$Myofibroblast1, cell_counts_rel$Myofibroblast2, method = "spearman")$p.value, 
                 "\nr = ",cor.test(cell_counts_rel$Myofibroblast1, cell_counts_rel$Myofibroblast2, method = "spearman")$estimate))
ggsave2("Fig3E.pdf", path = "../results", width = 10, height = 9, units = "cm")

###plots immune

ggplot(cell_counts_rel, aes(x = CD14_Macrophages1, y = CD14_Macrophages2)) +
  geom_point(aes(color = patient_id), size = 4) +
  scale_x_continuous(limits = c(0,0.6)) +
  scale_y_continuous(limits = c(0,0.8)) +
  scale_color_manual(values = use_colors) +
  geom_smooth(method = "lm", se = FALSE, size = 1, color = "black") +
  ggtitle(paste0("p = ",cor.test(cell_counts_rel$CD14_Macrophages1, cell_counts_rel$CD14_Macrophages2, method = "spearman")$p.value, 
                 "\nr = ",cor.test(cell_counts_rel$CD14_Macrophages1, cell_counts_rel$CD14_Macrophages2, method = "spearman")$estimate))
ggsave2("Fig4D_Macrophages1_2.pdf", path = "../results", width = 10, height = 9, units = "cm")

ggplot(cell_counts_rel, aes(x = CD14_Macrophages2, y = Plasmacytoid_Dendritic)) +
  geom_point(aes(color = patient_id), size = 4) +
  scale_x_continuous(limits = c(0,0.8)) +
  scale_y_continuous(limits = c(0,0.06)) +
  scale_color_manual(values = use_colors) +
  geom_smooth(method = "lm", se = FALSE, size = 1, color = "black") +
  ggtitle(paste0("p = ",cor.test(cell_counts_rel$CD14_Macrophages2, cell_counts_rel$Plasmacytoid_Dendritic, method = "spearman")$p.value, 
                 "\nr = ",cor.test(cell_counts_rel$CD14_Macrophages2, cell_counts_rel$Plasmacytoid_Dendritic, method = "spearman")$estimate))
ggsave2("Fig4D_Macrophages2_pDC.pdf", path = "../results", width = 10, height = 9, units = "cm")

ggplot(cell_counts_rel, aes(x = T_reg, y = T_CD8_exhausted)) +
  geom_point(aes(color = patient_id), size = 4) +
  scale_x_continuous(limits = c(0,0.5)) +
  scale_y_continuous(limits = c(0,0.85)) +
  scale_color_manual(values = use_colors) +
  geom_smooth(method = "lm", se = FALSE, size = 1, color = "black") +
  ggtitle(paste0("p = ",cor.test(cell_counts_rel$T_reg, cell_counts_rel$T_CD8_exhausted, method = "spearman")$p.value, 
                 "\nr = ",cor.test(cell_counts_rel$T_reg, cell_counts_rel$T_CD8_exhausted, method = "spearman")$estimate))
ggsave2("Fig4G_T_reg_exhausted.pdf", path = "../results", width = 10, height = 9, units = "cm")

ggplot(cell_counts_rel, aes(x = CD14_Macrophages2, y = T_CD8_exhausted)) +
  geom_point(aes(color = patient_id), size = 4) +
  scale_x_continuous(limits = c(0,0.8)) +
  scale_y_continuous(limits = c(0,0.85)) +
  scale_color_manual(values = use_colors) +
  geom_smooth(method = "lm", se = FALSE, size = 1, color = "black") +
  ggtitle(paste0("p = ",cor.test(cell_counts_rel$CD14_Macrophages2, cell_counts_rel$T_CD8_exhausted, method = "spearman")$p.value, 
                 "\nr = ",cor.test(cell_counts_rel$CD14_Macrophages2, cell_counts_rel$T_CD8_exhausted, method = "spearman")$estimate))
ggsave2("Fig4G_Macrophages_exhausted.pdf", path = "../results", width = 10, height = 9, units = "cm")

ggplot(cell_counts_rel, aes(x = Plasmacytoid_Dendritic, y = T_CD8_exhausted)) +
  geom_point(aes(color = patient_id), size = 4) +
  scale_x_continuous(limits = c(0,0.06)) +
  scale_y_continuous(limits = c(0,0.85)) +
  scale_color_manual(values = use_colors) +
  geom_smooth(method = "lm", se = FALSE, size = 1, color = "black") +
  ggtitle(paste0("p = ",cor.test(cell_counts_rel$Plasmacytoid_Dendritic, cell_counts_rel$T_CD8_exhausted, method = "spearman")$p.value, 
                 "\nr = ",cor.test(cell_counts_rel$Plasmacytoid_Dendritic, cell_counts_rel$T_CD8_exhausted, method = "spearman")$estimate))
ggsave2("Fig4G_pDC_exhausted.pdf", path = "../results", width = 10, height = 9, units = "cm")

###compare cell counts with IHC quantification

CD123 <- read_excel("../data/IHC_quantification/Quantification_CD123.xlsx")
CD123_scrnaseq <- cell_counts_rel %>% select(Plasmacytoid_Dendritic)
CD123 <- full_join(CD123, CD123_scrnaseq, by = "patient_id")
CD123 <- pivot_longer(CD123, cols = starts_with("image"), names_to = "image_nr", values_to = "count")
CD123_average <- CD123 %>% group_by(patient_id) %>% mutate(mean = mean(count)) %>% mutate(sd = sd(count)) %>% select(c(patient_id, Plasmacytoid_Dendritic, mean, sd)) %>% unique()
ggplot() +
  geom_jitter(data = CD123, mapping = aes(x = Plasmacytoid_Dendritic, y = count, color = patient_id), size = 0.5, alpha = 0.3) +
  geom_smooth(data = CD123_average, mapping = aes(x = Plasmacytoid_Dendritic, y = mean, color = patient_id), method = "lm", se = FALSE, size = 1, color = "black") +
  geom_errorbar(data = CD123_average, aes(x = Plasmacytoid_Dendritic, ymin=mean-sd, ymax=mean+sd, color = patient_id), size = 0.5, width = 0.002) +
  geom_point(data = CD123_average, aes(x = Plasmacytoid_Dendritic, y=mean, color = patient_id), shape=15, size = 3) +
  scale_color_manual(values = use_colors) +
  ggtitle(paste0("p = ",cor.test(CD123_average$mean, CD123_average$Plasmacytoid_Dendritic, method = "pearson")$p.value, 
                 "\nr = ",cor.test(CD123_average$mean, CD123_average$Plasmacytoid_Dendritic, method = "pearson")$estimate))
ggsave2("Fig4E_pDC_CD123.pdf", path = "../results", width = 10, height = 9, units = "cm")

CTHRC1 <- read_excel("../data/IHC_quantification/Quantification_CTHRC1.xlsx")
CTHRC1_scrnaseq <- cell_counts_rel %>% select(Myofibroblast2)
CTHRC1 <- full_join(CTHRC1, CTHRC1_scrnaseq, by = "patient_id")
CTHRC1 <- pivot_longer(CTHRC1, cols = starts_with("image"), names_to = "image_nr", values_to = "count")
CTHRC1_average <- CTHRC1 %>% group_by(patient_id) %>% mutate(mean = mean(count)) %>% mutate(sd = sd(count)) %>% select(c(patient_id, Myofibroblast2, mean, sd)) %>% unique()
ggplot() +
  geom_jitter(data = CTHRC1, mapping = aes(x = Myofibroblast2, y = count, color = patient_id), size = 0.5, alpha = 0.3) +
  geom_smooth(data = CTHRC1_average, mapping = aes(x = Myofibroblast2, y = mean, color = patient_id), method = "lm", se = FALSE, size = 1, color = "black") +
  geom_errorbar(data = CTHRC1_average, aes(x = Myofibroblast2, ymin=mean-sd, ymax=mean+sd, color = patient_id), size = 0.5) +
  geom_point(data = CTHRC1_average, aes(x = Myofibroblast2, y=mean, color = patient_id), shape=15, size = 3) +
  scale_color_manual(values = use_colors) +
  ggtitle(paste0("p = ",cor.test(CTHRC1_average$mean, CTHRC1_average$Myofibroblast2, method = "pearson")$p.value, 
                 "\nr = ",cor.test(CTHRC1_average$mean, CTHRC1_average$Myofibroblast2, method = "pearson")$estimate))
ggsave2("Fig3F_Myofibroblast2_CTHRC1.pdf", path = "../results", width = 10, height = 9, units = "cm")

CXCL9 <- read_excel("../data/IHC_quantification/Quantification_CXCL9.xlsx")
CXCL9_scrnaseq <- cell_counts_rel %>% select(CD14_Macrophages2)
CXCL9 <- full_join(CXCL9, CXCL9_scrnaseq, by = "patient_id")
CXCL9 <- pivot_longer(CXCL9, cols = starts_with("image"), names_to = "image_nr", values_to = "count")
CXCL9_average <- CXCL9 %>% group_by(patient_id) %>% mutate(mean = mean(count)) %>% mutate(sd = sd(count)) %>% select(c(patient_id, CD14_Macrophages2, mean, sd)) %>% unique()
ggplot() +
  geom_jitter(data = CXCL9, mapping = aes(x = CD14_Macrophages2, y = count, color = patient_id), size = 0.5, alpha = 0.3, width = 0.02) +
  geom_smooth(data = CXCL9_average, mapping = aes(x = CD14_Macrophages2, y = mean, color = patient_id), method = "lm", se = FALSE, size = 1, color = "black") +
  geom_errorbar(data = CXCL9_average, aes(x = CD14_Macrophages2, ymin=mean-sd, ymax=mean+sd, color = patient_id), size = 0.5, width = 0.03) +
  geom_point(data = CXCL9_average, aes(x = CD14_Macrophages2, y=mean, color = patient_id), shape=15, size = 3) +
  scale_color_manual(values = use_colors) +
  ggtitle(paste0("p = ",cor.test(CXCL9_average$mean, CXCL9_average$CD14_Macrophages2, method = "pearson")$p.value, 
                 "\nr = ",cor.test(CXCL9_average$mean, CXCL9_average$CD14_Macrophages2, method = "pearson")$estimate))
ggsave2("Fig4E_CD14_Macrophages_2_CXCL9.pdf", path = "../results", width = 10, height = 9, units = "cm")

########################################TCGA dataset

library(TCGAbiolinks)
library(GSVA)
library(SummarizedExperiment)
library(Seurat)
library(biomaRt)
library(dplyr)
library(tidyr)
library(ggplot2)
library(cowplot)
library(corrplot)
library(stringr)
library(survival)
library(survminer)
library(plotrix)
library(readr)
library(HDF5Array)

theme_set(theme_cowplot())

###load gene expression data

luad_se <- loadHDF5SummarizedExperiment(dir="../data/TCGA LUAD")
luad_mut <- read_csv("../data/data/TCGA_LUAD_mutations.csv")

###replace gene symbols, calculate log2-transformed matrix

luad_data <- as.data.frame(assay(luad_se))

luad_data$hgnc_symbol <- luad_se@rowRanges$external_gene_name

luad_data <- luad_data[!(duplicated(luad_data$hgnc_symbol)),]
rownames(luad_data) <- luad_data$hgnc_symbol

luad_data <- luad_data[,1:(ncol(luad_data)-1)]
luad_data <- as.matrix(luad_data)

log_luad_data <- log2(luad_data+1)

###calculate cell cluster marker genes

#imm_anno <- readRDS("seurat_objects/imm_anno.RDS")

imm_anno@meta.data$cell_type_imm <- ordered(imm_anno@meta.data$cell_type_imm, levels = c("Alveolar_Macrophages1",
                                                                                         "Alveolar_Macrophages2",
                                                                                         "Alveolar_Macrophages3",
                                                                                         "CD14_Macrophages1",
                                                                                         "CD14_Macrophages2",
                                                                                         "CD14_Macrophages3",
                                                                                         "CD14_Macrophages4",
                                                                                         "CD14_Macrophages5",
                                                                                         "Macrophages_Proliferating",
                                                                                         "Monocytes",
                                                                                         "Myeloid_Dendritic",
                                                                                         "Plasmacytoid_Dendritic",
                                                                                         "Mast",
                                                                                         "T_conv1",
                                                                                         "T_conv2",
                                                                                         "T_reg",
                                                                                         "T_CD8_1",
                                                                                         "T_CD8_2",
                                                                                         "T_CD8_3",
                                                                                         "T_CD8_Proliferating",
                                                                                         "NK_cells",
                                                                                         "B_cells",
                                                                                         "Plasma"))

imm_lympho <- subset(imm_anno, subset = cell_type_imm %in% c("T_conv1",
                                                             "T_conv2",
                                                             "T_reg",
                                                             "T_CD8_1",
                                                             "T_CD8_2",
                                                             "T_CD8_3",
                                                             "T_CD8_Proliferating",
                                                             "NK_cells",
                                                             "B_cells",
                                                             "Plasma"))
imm_lympho <- ScaleData(imm_lympho)

imm_myelo <- subset(imm_anno, subset = cell_type_imm %in% c("Alveolar_Macrophages1",
                                                            "Alveolar_Macrophages2",
                                                            "Alveolar_Macrophages3",
                                                            "CD14_Macrophages1",
                                                            "CD14_Macrophages2",
                                                            "CD14_Macrophages3",
                                                            "CD14_Macrophages4",
                                                            "CD14_Macrophages5",
                                                            "Macrophages_Proliferating",
                                                            "Monocytes",
                                                            "Myeloid_Dendritic",
                                                            "Plasmacytoid_Dendritic",
                                                            "Mast"))
imm_myelo <- ScaleData(imm_myelo)

#str_anno <- readRDS("seurat_objects/str_anno.RDS")

str_anno@meta.data$cell_type_str <- factor(str_anno@meta.data$cell_type_str, levels = c("Endothelial1",
                                                                                        "Endothelial2",
                                                                                        "Endothelial3",
                                                                                        "Endothelial4",
                                                                                        "Endothelial5",
                                                                                        "Endothelial6",
                                                                                        "Endothelial7",
                                                                                        "Lymphaticendothelial",
                                                                                        "Fibroblast1",
                                                                                        "Fibroblast2",
                                                                                        "Myofibroblast1",
                                                                                        "Myofibroblast2",
                                                                                        "Smoothmuscle1",
                                                                                        "Smoothmuscle2",
                                                                                        "Mesothelial"))

str_endo <- subset(str_anno, subset = cell_type_str %in% c("Endothelial1",
                                                           "Endothelial2",
                                                           "Endothelial3",
                                                           "Endothelial4",
                                                           "Endothelial5",
                                                           "Endothelial6",
                                                           "Endothelial7",
                                                           "Lymphaticendothelial"))
str_endo <- ScaleData(str_endo)

str_fibro <- subset(str_anno, subset = cell_type_str %in% c("Fibroblast1",
                                                            "Fibroblast2",
                                                            "Myofibroblast1",
                                                            "Myofibroblast2",
                                                            "Smoothmuscle1",
                                                            "Smoothmuscle2",
                                                            "Mesothelial"))
str_fibro <- ScaleData(str_fibro)

Idents(imm_myelo) <- imm_myelo@meta.data$cell_type_imm
Idents(imm_lympho) <- imm_lympho@meta.data$cell_type_imm
Idents(str_endo) <- str_endo@meta.data$cell_type_str
Idents(str_fibro) <- str_fibro@meta.data$cell_type_str

myelo_markers <- FindAllMarkers(imm_myelo, only.pos = T, min.pct = 0.25, min.diff.pct = 0.25)
write_csv(myelo_markers, path = "../results/SuppTable9.csv")

lympho_markers <- FindAllMarkers(imm_lympho, only.pos = T, min.pct = 0.25, min.diff.pct = 0.25)
write_csv(lympho_markers, path = "../results/SuppTable10.csv")

endo_markers <- FindAllMarkers(str_endo, only.pos = T, min.pct = 0.25, min.diff.pct = 0.25)
write_csv(endo_markers, path = "../results/SuppTable7.csv")

fibro_markers <- FindAllMarkers(str_fibro, only.pos = T, min.pct = 0.25, min.diff.pct = 0.25)
write_csv(fibro_markers, path = "../results/SuppTable8.csv")

clusters_myelo <- as.character(unique(myelo_markers$cluster))
clusters_lympho <- as.character(unique(lympho_markers$cluster))
clusters_endo <- as.character(unique(endo_markers$cluster))
clusters_fibro <- as.character(unique(fibro_markers$cluster))

markers <- list()

for (i in seq_along(clusters_myelo)){
  markers[paste0(clusters_myelo[i])] <- myelo_markers %>% filter(cluster == clusters_myelo[i]) %>% dplyr::select(gene)
}

for (i in seq_along(clusters_lympho)){
  markers[paste0(clusters_lympho[i])] <- lympho_markers %>% filter(cluster == clusters_lympho[i]) %>% dplyr::select(gene)
}

for (i in seq_along(clusters_endo)){
  markers[paste0(clusters_endo[i])] <- endo_markers %>% filter(cluster == clusters_endo[i]) %>% dplyr::select(gene)
}

for (i in seq_along(clusters_fibro)){
  markers[paste0(clusters_fibro[i])] <- fibro_markers %>% filter(cluster == clusters_fibro[i]) %>% dplyr::select(gene)
}

### genes from PCA of tumor epithelial cells

markers[["PC1_pos"]] <- c("SCGB3A1", "PIGR", "NAPSA", "C4BPA", "SCGB3A2", "HLA-DRA", "CD74", "ADGRF5", "C16orf89", "FOLR1", "SELENBP1", "HLA-DRB1", "ID4", "MGP", "AQP3", "CA2", "LHX9", "HLA-DPB1", "FMO5", "GKN2", "C5", "MUC1", "NPC2", "RNASE1", "PIK3C2G", "SFTA2", "SLC34A2", "HLA-DPA1", "FGFR3", "PGC")
markers[["PC1_neg"]] <- c("DSG2", "CAMK2N1", "FAM3C", "KRT7", "IFI27", "SLC2A1", "MARCKS", "PLAU", "AHNAK2", "PERP", "S100A4", "KRT19", "COL6A1", "UACA", "COL17A1", "CDA", "TPM2", "S100A16", "KRT8", "PRSS23", "DST", "LAMC2", "S100P", "PRSS3", "LAMA3", "DSP", "ITGA3", "MDK", "FAM83A", "ITGB4")

###short signatures

for (i in seq_along(clusters_myelo)){
  markers[paste0(clusters_myelo[i], "_short")] <- myelo_markers %>% filter(cluster == clusters_myelo[i]) %>% top_n(5, wt = avg_logFC) %>% dplyr::select(gene)
}

for (i in seq_along(clusters_lympho)){
  markers[paste0(clusters_lympho[i], "_short")] <- lympho_markers %>% filter(cluster == clusters_lympho[i]) %>% top_n(5, wt = avg_logFC)%>% dplyr::select(gene)
}

for (i in seq_along(clusters_endo)){
  markers[paste0(clusters_endo[i], "_short")] <- endo_markers %>% filter(cluster == clusters_endo[i]) %>% top_n(5, wt = avg_logFC)%>% dplyr::select(gene)
}

for (i in seq_along(clusters_fibro)){
  markers[paste0(clusters_fibro[i], "_short")] <- fibro_markers %>% filter(cluster == clusters_fibro[i]) %>% top_n(5, wt = avg_logFC)%>% dplyr::select(gene)
}

markers[["PC1_pos_short"]] <- c("SCGB3A1", "PIGR", "NAPSA", "C4BPA", "SCGB3A2")
markers[["PC1_neg_short"]] <- c("DSG2", "CAMK2N1", "FAM3C", "KRT7", "IFI27")

###complete pattern signatures

#long
markers[["N3MC_complete"]] <- unique(c(markers[["CD14_Macrophages1"]], markers[["Myofibroblast1"]], markers[["NK_cells"]], markers[["Myeloid_Dendritic"]], markers[["T_conv1"]], markers[["PC1_pos"]]))
markers[["CP2E_complete"]] <- unique(c(markers[["CD14_Macrophages2"]], markers[["Myofibroblast2"]], markers[["Plasmacytoid_Dendritic"]], markers[["T_CD8_1"]], markers[["T_CD8_2"]], markers[["PC1_neg"]]))

#short
markers[["N3MC_complete_short"]] <- unique(c(markers[["CD14_Macrophages1_short"]], markers[["Myofibroblast1_short"]], markers[["NK_cells_short"]], markers[["Myeloid_Dendritic_short"]], markers[["T_conv1_short"]], markers[["PC1_pos_short"]]))
markers[["CP2E_complete_short"]] <- unique(c(markers[["CD14_Macrophages2_short"]], markers[["Myofibroblast2_short"]], markers[["Plasmacytoid_Dendritic_short"]], markers[["T_CD8_1_short"]], markers[["T_CD8_2_short"]], markers[["PC1_neg_short"]]))

###simple pattern signatures

#long
markers[["N3MC_simple"]] <- unique(c(markers[["CD14_Macrophages1"]], markers[["Myofibroblast1"]]))
markers[["CP2E_simple"]] <- unique(c(markers[["Myofibroblast2"]], markers[["PC1_neg"]]))

#short
markers[["N3MC_simple_short"]] <- unique(c(markers[["CD14_Macrophages1_short"]], markers[["Myofibroblast1_short"]]))
markers[["CP2E_simple_short"]] <- unique(c(markers[["Myofibroblast2_short"]], markers[["PC1_neg_short"]]))

###save gene sets
write_csv(as_data_frame(markers["N3MC_complete"]), path = "../results/SuppTable11_1.csv")
write_csv(as_data_frame(markers["CP2E_complete"]), path = "../results/SuppTable11_2.csv")
write_csv(as_data_frame(markers["N3MC_simple_short"]), path = "../results/SuppTable11_3.csv")
write_csv(as_data_frame(markers["CP2E_simple_short"]), path = "../results/SuppTable11_4.csv")

###single-sample Gene set Enrichment Analysis (ssGSEA)

luad_gsva <- gsva(expr = log_luad_data, gset.idx.list = markers, method = "ssgsea", verbose = TRUE, kcdf = "Gaussian")

luad_gsva <- as.data.frame(t(luad_gsva))

###calculate ratios

luad_gsva <- mutate(luad_gsva, PC1_ratio = PC1_pos / PC1_neg)
luad_gsva <- mutate(luad_gsva, Myofibro_ratio = Myofibroblast1 / Myofibroblast2)
luad_gsva <- mutate(luad_gsva, Macro_ratio = CD14_Macrophages1 / CD14_Macrophages2)
luad_gsva <- mutate(luad_gsva, Pattern_complete_ratio = N3MC_complete / CP2E_complete)
luad_gsva <- mutate(luad_gsva, Pattern_simple_ratio = N3MC_simple / CP2E_simple)

luad_gsva <- mutate(luad_gsva, PC1_ratio_short = PC1_pos_short / PC1_neg_short)
luad_gsva <- mutate(luad_gsva, Myofibro_ratio_short = Myofibroblast1_short / Myofibroblast2_short)
luad_gsva <- mutate(luad_gsva, Macro_ratio_short = CD14_Macrophages1_short / CD14_Macrophages2_short)
luad_gsva <- mutate(luad_gsva, Pattern_complete_short_ratio = N3MC_complete_short / CP2E_complete_short)
luad_gsva <- mutate(luad_gsva, Pattern_simple_short_ratio = N3MC_simple_short / CP2E_simple_short)

###add follow-up data

luad_gsva$days_to_death <- luad_se$days_to_death
luad_gsva$vital_status <- luad_se$vital_status
luad_gsva$days_to_last_follow_up <- luad_se$days_to_last_follow_up

luad_gsva$overall_survival <- ifelse(luad_gsva$vital_status == "Dead", luad_gsva$days_to_death, luad_gsva$days_to_last_follow_up)

###binarize data

luad_gsva <- mutate(luad_gsva, vital_status_bin = ifelse(vital_status == "Alive", 0, 1))

ES_names <- colnames(luad_gsva[1:98])

for (i in seq_along(ES_names)) {
  luad_gsva[,paste0(ES_names[i], "_bin")] <- ifelse(luad_gsva[,ES_names[i]] > median(luad_gsva[,ES_names[i]]), 1, 0)
}

###correlation plot all clusters

# matrix of the p-value of the correlation (from http://www.sthda.com/english/wiki/visualize-correlation-matrix-using-correlogram)
cor.mtest <- function(mat, ...) {
  mat <- as.matrix(mat)
  n <- ncol(mat)
  p.mat<- matrix(NA, n, n)
  diag(p.mat) <- 0
  for (i in 1:(n - 1)) {
    for (j in (i + 1):n) {
      tmp <- cor.test(mat[, i], mat[, j], method = "spearman", ...)
      p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
    }
  }
  colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
  p.mat
}

p.mat <- cor.mtest(as.matrix(luad_gsva[,1:40]))

corr_tcga <- cor(as.matrix(luad_gsva[,1:40]), method = "spearman")

png(file = "../results/SuppFig8B.png", width = 30, height = 30, units = "cm", res = 600)
corrplot(corr_tcga, p.mat = p.mat, sig.level = 0.05, insig = "blank", type = "full", tl.col = "black")
dev.off()

#pdf(file = "output/fig5/corrplot_tcga.pdf", width = 7, height = 7)
#corrplot(corr_tcga, p.mat = p.mat, sig.level = 0.05, insig = "blank", type = "full", tl.col = "black")
#dev.off()

###correlation plot for selected cell clusters

### PC1 ~ Macrophages

ggplot(luad_gsva, aes(y = PC1_pos, x = CD14_Macrophages1)) +
  geom_point(color = "grey", size = 0.1) +
  geom_smooth(method = "lm", se = FALSE, size = 1, color = "black") +
  ggtitle(paste0("p = ",cor.test(luad_gsva$PC1_pos, luad_gsva$CD14_Macrophages1, method = "spearman")$p.value, 
                 "\nrho = ",cor.test(luad_gsva$PC1_pos, luad_gsva$CD14_Macrophages1, method = "spearman")$estimate))
ggsave2("Fig5E_PC1_pos_CD14_Macrophages1.pdf", path = "../results", width = 10, height = 10, units = "cm")

ggplot(luad_gsva, aes(y = PC1_pos, x = CD14_Macrophages2)) +
  geom_point(color = "grey", size = 0.1) +
  geom_smooth(method = "lm", se = FALSE, size = 1, color = "black") +
  ggtitle(paste0("p = ",cor.test(luad_gsva$PC1_pos, luad_gsva$CD14_Macrophages2, method = "spearman")$p.value, 
                 "\nrho = ",cor.test(luad_gsva$PC1_pos, luad_gsva$CD14_Macrophages2, method = "spearman")$estimate))
ggsave2("Fig5E_PC1_pos_CD14_Macrophages2.pdf", path = "../results", width = 10, height = 10, units = "cm")

ggplot(luad_gsva, aes(y = PC1_neg, x = CD14_Macrophages1)) +
  geom_point(color = "grey", size = 0.1) +
  geom_smooth(method = "lm", se = FALSE, size = 1, color = "black") +
  ggtitle(paste0("p = ",cor.test(luad_gsva$PC1_neg, luad_gsva$CD14_Macrophages1, method = "spearman")$p.value, 
                 "\nrho = ",cor.test(luad_gsva$PC1_neg, luad_gsva$CD14_Macrophages1, method = "spearman")$estimate))
ggsave2("Fig5E_PC1_neg_CD14_Macrophages1.pdf", path = "../results", width = 10, height = 10, units = "cm")

ggplot(luad_gsva, aes(y = PC1_neg, x = CD14_Macrophages2)) +
  geom_point(color = "grey", size = 0.1) +
  geom_smooth(method = "lm", se = FALSE, size = 1, color = "black") +
  ggtitle(paste0("p = ",cor.test(luad_gsva$PC1_neg, luad_gsva$CD14_Macrophages2, method = "spearman")$p.value, 
                 "\nrho = ",cor.test(luad_gsva$PC1_neg, luad_gsva$CD14_Macrophages2, method = "spearman")$estimate))
ggsave2("Fig5E_PC1_neg_CD14_Macrophages2.pdf", path = "../results", width = 10, height = 10, units = "cm")

### PC1 ~ Myofibroblasts

ggplot(luad_gsva, aes(y = PC1_pos, x = Myofibroblast1)) +
  geom_point(color = "grey", size = 0.1) +
  geom_smooth(method = "lm", se = FALSE, size = 1, color = "black") +
  ggtitle(paste0("p = ",cor.test(luad_gsva$PC1_pos, luad_gsva$Myofibroblast1, method = "spearman")$p.value, 
                 "\nrho = ",cor.test(luad_gsva$PC1_pos, luad_gsva$Myofibroblast1, method = "spearman")$estimate))
ggsave2("Fig5E_PC1_pos_Myofibroblast1.pdf", path = "../results", width = 10, height = 10, units = "cm")

ggplot(luad_gsva, aes(y = PC1_pos, x = Myofibroblast2)) +
  geom_point(color = "grey", size = 0.1) +
  geom_smooth(method = "lm", se = FALSE, size = 1, color = "black") +
  ggtitle(paste0("p = ",cor.test(luad_gsva$PC1_pos, luad_gsva$Myofibroblast2, method = "spearman")$p.value, 
                 "\nrho = ",cor.test(luad_gsva$PC1_pos, luad_gsva$Myofibroblast2, method = "spearman")$estimate))
ggsave2("Fig5E_PC1_pos_Myofibroblast2.pdf", path = "../results", width = 10, height = 10, units = "cm")

ggplot(luad_gsva, aes(y = PC1_neg, x = Myofibroblast1)) +
  geom_point(color = "grey", size = 0.1) +
  geom_smooth(method = "lm", se = FALSE, size = 1, color = "black") +
  ggtitle(paste0("p = ",cor.test(luad_gsva$PC1_neg, luad_gsva$Myofibroblast1, method = "spearman")$p.value, 
                 "\nrho = ",cor.test(luad_gsva$PC1_neg, luad_gsva$Myofibroblast1, method = "spearman")$estimate))
ggsave2("Fig5E_PC1_neg_Myofibroblast1.pdf", path = "../results", width = 10, height = 10, units = "cm")

ggplot(luad_gsva, aes(y = PC1_neg, x = Myofibroblast2)) +
  geom_point(color = "grey", size = 0.1) +
  geom_smooth(method = "lm", se = FALSE, size = 1, color = "black") +
  ggtitle(paste0("p = ",cor.test(luad_gsva$PC1_neg, luad_gsva$Myofibroblast2, method = "spearman")$p.value, 
                 "\nrho = ",cor.test(luad_gsva$PC1_neg, luad_gsva$Myofibroblast2, method = "spearman")$estimate))
ggsave2("Fig5E_PC1_neg_Myofibroblast2.pdf", path = "../results", width = 10, height = 10, units = "cm")

###Survival analysis

# Kaplan-Meyer and Log-rank

for (i in c(104:201)) {
  ggsurvplot(survfit(Surv(as.numeric(overall_survival/30.4375), vital_status_bin)~luad_gsva[[i]], data = luad_gsva), pval = T, pval.method = T, title = paste0(colnames(luad_gsva)[i]), xlim = c(0,120), break.x.by = 30, palette = c("Red", "Blue"))
  ggsave2(paste0("Fig5F_SuppFig11_", colnames(luad_gsva)[i],".pdf"), path = "../results", width = 8, height = 10, units = "cm")
}

# Cox regression

summary(coxph(Surv(as.numeric(overall_survival), vital_status_bin) ~ Macro_ratio_bin, data = luad_gsva)) 
summary(coxph(Surv(as.numeric(overall_survival), vital_status_bin) ~ Myofibro_ratio_bin, data = luad_gsva))
summary(coxph(Surv(as.numeric(overall_survival), vital_status_bin) ~ PC1_ratio_bin, data = luad_gsva))
summary(coxph(Surv(as.numeric(overall_survival), vital_status_bin) ~ Pattern_complete_ratio_bin, data = luad_gsva))
summary(coxph(Surv(as.numeric(overall_survival), vital_status_bin) ~ Pattern_simple_ratio_bin, data = luad_gsva))
summary(coxph(Surv(as.numeric(overall_survival), vital_status_bin) ~ Pattern_complete_short_ratio_bin, data = luad_gsva))
summary(coxph(Surv(as.numeric(overall_survival), vital_status_bin) ~ Pattern_simple_short_ratio_bin, data = luad_gsva))
summary(coxph(Surv(as.numeric(overall_survival), vital_status_bin) ~ Macro_ratio_bin + Myofibro_ratio_bin + PC1_ratio_bin, data = luad_gsva))

###add mutation data

#genes from nNGMv2 Panel
mutations <- c("ALK", "BRAF", "CTNNB1", "EGFR", "ERBB2", "FGFR1", "FGFR2", "FGFR3", "FGFR4", "HRAS", "IDH1", "KEAP1", "KRAS", "MAP2K1", "MET", "NRAS", "NTRK1", "NTRK2", "NTRK3", "PIK3CA", "PTEN", "RET", "ROS1", "STK11", "TP53")

patients_mut <- list()

for (i in seq_along(mutations)){
  patients_mut[[i]] <- filter(luad_mut, Hugo_Symbol == mutations[i] & Variant_Classification != "Silent")$Tumor_Sample_Barcode %>% as.vector() %>% substr(1,12)
}

names(patients_mut) <- mutations

luad_gsva_mut <- luad_gsva

luad_gsva_mut[, mutations] <- 0
luad_gsva_mut$samples <- rownames(luad_gsva_mut) %>% substr(1,12)

luad_gsva_mut$Pattern_ratio_complete_bin <- ifelse(luad_gsva_mut$Pattern_complete_ratio_bin == 1, "N3MC_complete", "CP2E_complete")
luad_gsva_mut$Pattern_ratio_simple_short_bin <- ifelse(luad_gsva_mut$Pattern_simple_short_ratio_bin == 1, "N3MC_simple_short", "CP2E_simple_short")

luad_gsva_mut$TMB <- as.character(luad_se$paper_Nonsilent.Mutations.per.Mb) %>% gsub(pattern = ",", replacement = ".", fixed = T) %>% as.numeric()

for(i in seq_along(mutations)){
  luad_gsva_mut[, mutations[i]] <- ifelse(luad_gsva_mut$sample %in% patients_mut[[i]], "mut", "wt")
}

luad_gsva_mut <- filter(luad_gsva_mut, samples %in% substr(luad_mut$Tumor_Sample_Barcode, 1, 12))

#Pattern_ratio_bin

for(i in seq_along(mutations)){
  mut_counts <- luad_gsva_mut %>% group_by(Pattern_ratio_complete_bin) %>% dplyr::count(eval(parse(text = mutations[i]))) %>% pivot_wider(id_cols = Pattern_ratio_complete_bin, names_from = "eval(parse(text = mutations[i]))", values_from = n)

  mut_counts[is.na(mut_counts)] <- 0

  chi <- chisq.test(mut_counts[,2:3])

  ggplot(luad_gsva_mut) + 
    geom_bar(aes(x = Pattern_ratio_complete_bin, fill = eval(parse(text = mutations[i]))), position = "fill") +
    theme(legend.title = element_blank()) +
    ggtitle(paste0(mutations[i], " X虏 = ", chi$statistic, " p = ", chi$p.value))
  ggsave2(paste0("Fig5G_mut_", mutations[[i]], ".pdf"), path = "../results", width = 20, height = 10, units = "cm")
}

#for(i in seq_along(mutations)){
#  mut_counts <- luad_gsva_mut %>% group_by(Pattern_ratio_simple_short_bin) %>% dplyr::count(eval(parse(text = mutations[i]))) %>% pivot_wider(id_cols = Pattern_ratio_simple_short_bin, names_from = "eval(parse(text = mutations[i]))", values_from = n)
#  
#  mut_counts[is.na(mut_counts)] <- 0
#  
#  chi <- chisq.test(mut_counts[,2:3])
#  
#  ggplot(luad_gsva_mut) + 
#    geom_bar(aes(x = Pattern_ratio_simple_short_bin, fill = eval(parse(text = mutations[i]))), position = "fill") +
#    theme(legend.title = element_blank()) +
#    ggtitle(paste0(mutations[i], " X虏 = ", chi$statistic, " p = ", chi$p.value))
#  ggsave2(paste0("Fig5G_mut_simple_short_", mutations[[i]], ".pdf"), path = "../results", width = 20, height = 10, units = "cm")
#}

###Tumor mutational burden

ggplot(luad_gsva_mut) +
  geom_boxplot(aes(x = Pattern_ratio_complete_bin, y = TMB)) +
  ggtitle(paste0("p = ", t.test(formula = TMB~Pattern_ratio_complete_bin, data = luad_gsva_mut, alternative = "two.sided", paired = F)$p.value))
ggsave2("Fig5G_TMB.pdf", path = "../results", width = 20, height = 10, units = "cm")

#ggplot(luad_gsva_mut) +
#  geom_boxplot(aes(x = Pattern_ratio_simple_short_bin, y = TMB)) +
#  ggtitle(paste0("p = ", t.test(formula = TMB~Pattern_ratio_simple_short_bin, data = luad_gsva_mut, alternative = "two.sided", paired = F)$p.value))
#ggsave2("Fig5G_TMB_simple_short.pdf", path = "../results", width = 20, height = 10, units = "cm")

###add fusion data

fusions.data <- data.frame(luad_se$paper_Fusions, luad_se$barcode)
fusions.data$barcode <- fusions.data$luad_se.barcode

luad_gsva_mut$barcode <- rownames(luad_gsva_mut)

luad_gsva_fus <- inner_join(fusions.data, luad_gsva_mut, by = "barcode")

luad_gsva_fus$alk_fusions <- ifelse(grepl("ALK", luad_gsva_fus$luad_se.paper_Fusions) == T, "fus", "wt")
luad_gsva_fus$ret_fusions <- ifelse(grepl("RET", luad_gsva_fus$luad_se.paper_Fusions) == T, "fus", "wt")
luad_gsva_fus$ros_fusions <- ifelse(grepl("ROS", luad_gsva_fus$luad_se.paper_Fusions) == T, "fus", "wt")

fusions <- c("alk_fusions", "ret_fusions", "ros_fusions")

#Pattern_ratio_bin

for(i in seq_along(fusions)){
  fus_counts <- luad_gsva_fus %>% group_by(Pattern_ratio_complete_bin) %>% dplyr::count(eval(parse(text = fusions[i]))) %>% pivot_wider(id_cols = Pattern_ratio_complete_bin, names_from = "eval(parse(text = fusions[i]))", values_from = n)

  fus_counts[is.na(fus_counts)] <- 0

  chi <- chisq.test(fus_counts[,2:3])

  ggplot(luad_gsva_fus) + 
    geom_bar(aes(x = Pattern_ratio_complete_bin, fill = eval(parse(text = fusions[i]))), position = "fill") +
    theme(legend.title = element_blank()) +
    ggtitle(paste0(fusions[i], " X? = ", chi$statistic, " p = ", chi$p.value))
  ggsave2(paste0("Fig5G_fus_", fusions[[i]], ".pdf"), path = "../results", width = 20, height = 10, units = "cm")      
}

#for(i in seq_along(fusions)){
#  fus_counts <- luad_gsva_fus %>% group_by(Pattern_ratio_simple_short_bin) %>% dplyr::count(eval(parse(text = fusions[i]))) %>% pivot_wider(id_cols = Pattern_ratio_simple_short_bin, names_from = "eval(parse(text = fusions[i]))", values_from = n)
#  
#  fus_counts[is.na(fus_counts)] <- 0
#  
#  chi <- chisq.test(fus_counts[,2:3])
#  
#  ggplot(luad_gsva_fus) + 
#    geom_bar(aes(x = Pattern_ratio_simple_short_bin, fill = eval(parse(text = fusions[i]))), position = "fill") +
#    theme(legend.title = element_blank()) +
#    ggtitle(paste0(fusions[i], " X? = ", chi$statistic, " p = ", chi$p.value))
#  ggsave2(paste0("Fig5_fus_simple_short_", fusions[[i]], ".pdf"), path = "../results", width = 20, height = 10, units = "cm")      
#}

print(nrow(luad_gsva))
print(nrow(luad_gsva_mut))
luad_gsva_mut %>% filter(TMB != "NA") %>% nrow() %>% print()

Reference

https://codeocean.com/capsule/8321305/tree/v1
P. Bischoff et al., “Single-cell RNA sequencing reveals distinct tumor microenvironmental patterns in lung adenocarcinoma,” Oncogene 2021 4050, vol. 40, no. 50, pp. 6748–6758, Oct. 2021, doi: 10.1038/s41388-021-02054-3.

Leave a Reply

Your email address will not be published. Required fields are marked *