Now for the moment we've all been waiting for: how do our differential expression results line up with the figures published by the authors?
First, we need to read in the differential expression data, stored for transcript-level differential expression data in the file isoform_exp.diff
from cuffdiff
. You'll notice below that the data is stored at a Dropbox link. That's because it's too big for GitHub
. Feel free to change the path to your own file if you ran the differential expression steps yourself (congratulations, too!).
import pandas as pd
import numpy as np
ids = "SRR633516 SRR633540 SRR633541 SRR633542 SRR633543 SRR633549 SRR633548 SRR633547 SRR633546 SRR633544 SRR633545".split(" ")
qvals = ["q" + str(curr) for curr in list(range(1,12))]
labels_fixed = dict(zip(qvals,ids))
#diffexpn = pd.read_csv("../../bvcn-sample/data/isoform_exp.diff", sep = "\t").replace(to_replace=labels_fixed)
diffexpn = pd.read_csv("https://www.dropbox.com/s/kexe5j6d7f1oqrg/isoform_exp_28Apr.diff?dl=1", sep = "\t").replace(to_replace=labels_fixed)
#diffexpn = diffexpn.loc[diffexpn["status"] == "OK",:]
diffexpn
ids = "SRR633516 SRR633540 SRR633541 SRR633542 SRR633543 SRR633549 SRR633548 SRR633547 SRR633546 SRR633544 SRR633545".split(" ")
treatments = ["ck","ck","ck","ck","ck","cold","cold","cold","cold","cold","cold"]
treatments_fixed = dict(zip(ids,treatments))
diffexpn["treatment_1"] = diffexpn["sample_1"].replace(to_replace=treatments_fixed)
diffexpn["treatment_2"] = diffexpn["sample_2"].replace(to_replace=treatments_fixed)
list(diffexpn["log2(fold_change)"][diffexpn["log2(fold_change)"] > 0])
diffexpn.loc[diffexpn["gene_id"] == "XLOC_000001",:]
diffexpnunique = pd.DataFrame(diffexpn.groupby(["gene_id", "treatment_1", "treatment_2"], as_index=False)["value_1","value_2","log2(fold_change)"].mean())
diffexpnunique = diffexpnunique.loc[\
#(diffexpnunique["value_1"] > 0.1) & \
#(diffexpnunique["value_2"] > 0.1) & \
(diffexpnunique["gene_id"] != np.NaN) & \
(diffexpnunique["treatment_1"] == "ck") & \
(diffexpnunique["treatment_2"] == "cold"),:]
diffexpnunique = diffexpn.loc[\
#(diffexpnunique["value_1"] > 0.1) & \
#(diffexpnunique["value_2"] > 0.1) & \
(diffexpn["gene_id"] != np.NaN) & \
(diffexpn["treatment_1"] == "ck") & \
(diffexpn["treatment_2"] == "cold"),:]
list(sorted(diffexpnunique["log2(fold_change)"],reverse=False))
diffexpnunique.loc[diffexpnunique["log2(fold_change)"] > 0.1, :]
diffexpnunique
trtmt = "ck"
# Count unique genes with FPKM <= 1
lowval = 0.1
highval = 1
totalgenes = len(diffexpnunique['gene_id'][diffexpnunique["treatment_1"] == trtmt])
print(totalgenes)
lowfpkm = len(diffexpnunique['gene_id'][(diffexpnunique["treatment_1"] == trtmt) & \
(diffexpnunique['value_1'] > lowval) & \
(diffexpnunique['value_1'] <= highval)])
print("FPKM <= 1")
print(lowfpkm)
print(lowfpkm / totalgenes)
# Count unique genes with 1 < FPKM < 20
lowval = 1
highval = 20
totalgenes = len(diffexpnunique['gene_id'][diffexpnunique["treatment_1"] == trtmt])
midfpkm = len(diffexpnunique['gene_id'][(diffexpnunique["treatment_1"] == trtmt) & \
(diffexpnunique['value_1'] > lowval) & \
(diffexpnunique['value_1'] <= highval)])
print("1 < FPKM <= 20")
print(midfpkm)
print(midfpkm / totalgenes)
# Count unique genes with FPKM > 20
lowval = 20
highval = 1000000
totalgenes = len(diffexpnunique['gene_id'][diffexpnunique["treatment_1"] == trtmt])
highfpkm = len(diffexpnunique['gene_id'][(diffexpnunique["treatment_1"] == trtmt) & \
(diffexpnunique['value_1'] > lowval) & \
(diffexpnunique['value_1'] <= highval)])
print("FPKM > 20")
print(highfpkm)
print(highfpkm / totalgenes)
## Now for cold genes
trtmt = "ck"
# Count unique genes with FPKM <= 1
lowval = -100000
highval = 1
totalgenes = len(diffexpnunique['gene_id'][diffexpnunique["treatment_1"] == trtmt])
print(totalgenes)
lowfpkm = len(diffexpnunique['gene_id'][(diffexpnunique["treatment_1"] == trtmt) & \
(diffexpnunique['value_2'] > lowval) & \
(diffexpnunique['value_2'] <= highval)])
print("FPKM <= 1")
print(lowfpkm)
print(lowfpkm / totalgenes)
# Count unique genes with 1 < FPKM < 20
lowval = 1
highval = 20
totalgenes = len(diffexpnunique['gene_id'][diffexpnunique["treatment_1"] == trtmt])
midfpkm = len(diffexpnunique['gene_id'][(diffexpnunique["treatment_1"] == trtmt) & \
(diffexpnunique['value_2'] > lowval) & \
(diffexpnunique['value_2'] <= highval)])
print("1 < FPKM <= 20")
print(midfpkm)
print(midfpkm / totalgenes)
# Count unique genes with FPKM > 20
lowval = 20
highval = 1000000
totalgenes = len(diffexpnunique['gene_id'][diffexpnunique["treatment_1"] == trtmt])
highfpkm = len(diffexpnunique['gene_id'][(diffexpnunique["treatment_1"] == trtmt) & \
(diffexpnunique['value_2'] > lowval) & \
(diffexpnunique['value_2'] <= highval)])
print("FPKM > 20")
print(highfpkm)
print(highfpkm / totalgenes)
Now let's do the same thing for the gene-level differential expression data, stored in gene_exp.diff
.
diffexpngenedl = pd.read_csv("https://www.dropbox.com/s/kexe5j6d7f1oqrg/gene_exp_28Apr.diff?dl=1", sep = "\t").replace(to_replace=labels_fixed)
For this gene differential expression data, we need to include the correspondence between the accession numbers and the treatments so that we can filter out the treatments based on whether they were control or cold-treated.
ids = "SRR633516 SRR633540 SRR633541 SRR633542 SRR633543 SRR633549 SRR633548 SRR633547 SRR633546 SRR633544 SRR633545".split(" ")
treatments = ["ck","ck","ck","ck","ck","cold","cold","cold","cold","cold","cold"]
treatments_fixed = dict(zip(ids,treatments))
diffexpngene = diffexpngenedl.copy(deep=True)
diffexpngene["treatment_1"] = diffexpngene["sample_1"].replace(to_replace=treatments_fixed)
diffexpngene["treatment_2"] = diffexpngene["sample_2"].replace(to_replace=treatments_fixed)
diffexpngenefilt = diffexpngene
diffexpngenefilt
diffexpngenefilt = pd.DataFrame(diffexpngenefilt.groupby(["gene_id", "treatment_1", "treatment_2"], as_index=False)["value_1","value_2","log2(fold_change)"].mean())
diffexpngenefilt = diffexpngenefilt.loc[\
#(diffexpngene["value_1"] > 0.1) & \
#(diffexpngene["value_2"] > 0.1) & \
((diffexpngenefilt["value_1"] > 0.1) | (diffexpngenefilt["value_2"] > 0.1)) & \
#(diffexpngene["status"] == "OK") & \
(diffexpngenefilt["gene_id"] != np.NaN) & \
(((diffexpngenefilt["treatment_1"] == "ck") & (diffexpngenefilt["treatment_2"] == "cold")) | \
((diffexpngenefilt["treatment_2"] == "ck") & (diffexpngenefilt["treatment_1"] == "cold"))),:]
diffexpngenefilt
len(diffexpngenefilt["gene_id"].unique())
#diffexpngenefilt = pd.DataFrame(diffexpngenefilt.groupby(["gene_id", "treatment_1", "treatment_2"], as_index=False)["value_1","value_2","log2(fold_change)"].mean())
diffexpngenefilt
trtmt = "ck"
# Count unique genes with FPKM <= 1
lowval = 0.1
highval = 1
totalgenes = len(diffexpngenefilt['gene_id'][diffexpngenefilt["treatment_1"] == trtmt])
print(totalgenes)
lowfpkm = len(diffexpngenefilt['gene_id'][(diffexpngenefilt["treatment_1"] == trtmt) & \
(diffexpngenefilt['value_1'] > lowval) & \
(diffexpngenefilt['value_1'] <= highval)])
print("FPKM <= 1")
print(lowfpkm)
print(lowfpkm / totalgenes)
# Count unique genes with 1 < FPKM < 20
lowval = 1
highval = 20
totalgenes = len(diffexpngenefilt['gene_id'][diffexpngenefilt["treatment_1"] == trtmt])
midfpkm = len(diffexpngenefilt['gene_id'][(diffexpngenefilt["treatment_1"] == trtmt) & \
(diffexpngenefilt['value_1'] > lowval) & \
(diffexpngenefilt['value_1'] <= highval)])
print("1 < FPKM <= 20")
print(midfpkm)
print(midfpkm / totalgenes)
# Count unique genes with FPKM > 20
lowval = 20
highval = 1000000
totalgenes = len(diffexpngenefilt['gene_id'][diffexpngenefilt["treatment_1"] == trtmt])
highfpkm = len(diffexpngenefilt['gene_id'][(diffexpngenefilt["treatment_1"] == trtmt) & \
(diffexpngenefilt['value_1'] > lowval) & \
(diffexpngenefilt['value_1'] <= highval)])
print("FPKM > 20")
print(highfpkm)
print(highfpkm / totalgenes)
trtmt = "cold"
# Count unique genes with FPKM <= 1
lowval = 0.1
highval = 1
totalgenes = len(diffexpngenefilt['gene_id'][diffexpngenefilt["treatment_2"] == trtmt])
print(totalgenes)
lowfpkm = len(diffexpngenefilt['gene_id'][(diffexpngenefilt["treatment_2"] == trtmt) & \
(diffexpngenefilt['value_2'] > lowval) & \
(diffexpngenefilt['value_2'] <= highval)])
print("FPKM <= 1")
print(lowfpkm)
print(lowfpkm / totalgenes)
# Count unique genes with 1 < FPKM < 20
lowval = 1
highval = 20
totalgenes = len(diffexpngenefilt['gene_id'][diffexpngenefilt["treatment_2"] == trtmt])
midfpkm = len(diffexpngenefilt['gene_id'][(diffexpngenefilt["treatment_2"] == trtmt) & \
(diffexpngenefilt['value_2'] > lowval) & \
(diffexpngenefilt['value_2'] <= highval)])
print("1 < FPKM <= 20")
print(midfpkm)
print(midfpkm / totalgenes)
# Count unique genes with FPKM > 20
lowval = 20
highval = 1000000
totalgenes = len(diffexpngenefilt['gene_id'][diffexpngenefilt["treatment_2"] == trtmt])
highfpkm = len(diffexpngenefilt['gene_id'][(diffexpngenefilt["treatment_2"] == trtmt) & \
(diffexpngenefilt['value_2'] > lowval) & \
(diffexpngenefilt['value_2'] <= highval)])
print("FPKM > 20")
print(highfpkm)
print(highfpkm / totalgenes)
These values are in the neighborhood of what we expected, but not quite right. For now, we'll run with them. We had issues with versioning of several softwares as well as potentially with our analysis itself. Our best bet is to just merge our results with the authors' files downstream.