Search
Pipelining and differential expression analysis (Python)

Now for the moment we've all been waiting for: how do our differential expression results line up with the figures published by the authors?

First, we need to read in the differential expression data, stored for transcript-level differential expression data in the file isoform_exp.diff from cuffdiff. You'll notice below that the data is stored at a Dropbox link. That's because it's too big for GitHub. Feel free to change the path to your own file if you ran the differential expression steps yourself (congratulations, too!).

import pandas as pd
import numpy as np

ids = "SRR633516 SRR633540 SRR633541 SRR633542 SRR633543 SRR633549 SRR633548 SRR633547 SRR633546 SRR633544 SRR633545".split(" ")
qvals = ["q" + str(curr) for curr in list(range(1,12))]

labels_fixed = dict(zip(qvals,ids))
#diffexpn = pd.read_csv("../../bvcn-sample/data/isoform_exp.diff", sep = "\t").replace(to_replace=labels_fixed)
diffexpn = pd.read_csv("https://www.dropbox.com/s/kexe5j6d7f1oqrg/isoform_exp_28Apr.diff?dl=1", sep = "\t").replace(to_replace=labels_fixed)
#diffexpn = diffexpn.loc[diffexpn["status"] == "OK",:]
diffexpn
test_id gene_id gene locus sample_1 sample_2 status value_1 value_2 log2(fold_change) test_stat p_value q_value significant
0 TCONS_00000001 XLOC_000001 - 1:6407-16373 SRR633516 SRR633540 OK 5.048820 3.691990 -0.451547 -0.117276 0.83120 0.989492 no
1 TCONS_00000002 XLOC_000001 - 1:6407-16373 SRR633516 SRR633540 NOTEST 0.000000 0.631836 inf 0.000000 1.00000 1.000000 no
2 TCONS_00000003 XLOC_000001 - 1:6407-16373 SRR633516 SRR633540 NOTEST 0.000000 0.000000 0.000000 0.000000 1.00000 1.000000 no
3 TCONS_00000004 XLOC_000001 - 1:6407-16373 SRR633516 SRR633540 NOTEST 0.000000 0.483925 inf 0.000000 1.00000 1.000000 no
4 TCONS_00000005 XLOC_000002 - 1:18715-23389 SRR633516 SRR633540 NOTEST 0.002775 0.003065 0.143296 0.000000 1.00000 1.000000 no
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3568505 TCONS_00064891 XLOC_046176 - KZ116064.1:28517-32690 SRR633544 SRR633545 NOTEST 0.000000 0.000000 0.000000 0.000000 1.00000 1.000000 no
3568506 TCONS_00064892 XLOC_046177 - KZ116064.1:67456-82859 SRR633544 SRR633545 NOTEST 0.000000 0.000000 0.000000 0.000000 1.00000 1.000000 no
3568507 TCONS_00064893 XLOC_046178 - KZ116065.1:2332-20619 SRR633544 SRR633545 NOTEST 0.549827 0.536989 -0.034086 0.000000 1.00000 1.000000 no
3568508 TCONS_00064894 XLOC_046179 - KZ116066.1:46423-46539 SRR633544 SRR633545 NOTEST 0.000000 0.000000 0.000000 0.000000 1.00000 1.000000 no
3568509 TCONS_00064895 XLOC_046180 - MT:1-16596 SRR633544 SRR633545 OK 1499.220000 1471.660000 -0.026770 -0.071112 0.47885 0.989492 no

3568510 rows × 14 columns

ids = "SRR633516 SRR633540 SRR633541 SRR633542 SRR633543 SRR633549 SRR633548 SRR633547 SRR633546 SRR633544 SRR633545".split(" ")
treatments = ["ck","ck","ck","ck","ck","cold","cold","cold","cold","cold","cold"]

treatments_fixed = dict(zip(ids,treatments))
diffexpn["treatment_1"] = diffexpn["sample_1"].replace(to_replace=treatments_fixed)
diffexpn["treatment_2"] = diffexpn["sample_2"].replace(to_replace=treatments_fixed)
list(diffexpn["log2(fold_change)"][diffexpn["log2(fold_change)"] > 0])
diffexpn.loc[diffexpn["gene_id"] == "XLOC_000001",:]
test_id gene_id gene locus sample_1 sample_2 status value_1 value_2 log2(fold_change) test_stat p_value q_value significant treatment_1 treatment_2
0 TCONS_00000001 XLOC_000001 - 1:6407-16373 SRR633516 SRR633540 OK 5.04882 3.691990 -0.451547 -0.117276 0.83120 0.989492 no ck ck
1 TCONS_00000002 XLOC_000001 - 1:6407-16373 SRR633516 SRR633540 NOTEST 0.00000 0.631836 inf 0.000000 1.00000 1.000000 no ck ck
2 TCONS_00000003 XLOC_000001 - 1:6407-16373 SRR633516 SRR633540 NOTEST 0.00000 0.000000 0.000000 0.000000 1.00000 1.000000 no ck ck
3 TCONS_00000004 XLOC_000001 - 1:6407-16373 SRR633516 SRR633540 NOTEST 0.00000 0.483925 inf 0.000000 1.00000 1.000000 no ck ck
64882 TCONS_00000001 XLOC_000001 - 1:6407-16373 SRR633516 SRR633541 OK 5.04882 4.777240 -0.079768 -0.022089 0.87755 0.989492 no ck ck
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3438749 TCONS_00000004 XLOC_000001 - 1:6407-16373 SRR633546 SRR633545 NOTEST 0.00000 0.000000 0.000000 0.000000 1.00000 1.000000 no cold cold
3503628 TCONS_00000001 XLOC_000001 - 1:6407-16373 SRR633544 SRR633545 OK 2.89373 3.379860 0.224033 0.047890 0.90775 0.989492 no cold cold
3503629 TCONS_00000002 XLOC_000001 - 1:6407-16373 SRR633544 SRR633545 OK 1.97671 2.160660 0.128377 0.021892 0.94345 0.989492 no cold cold
3503630 TCONS_00000003 XLOC_000001 - 1:6407-16373 SRR633544 SRR633545 NOTEST 0.00000 0.005295 inf 0.000000 1.00000 1.000000 no cold cold
3503631 TCONS_00000004 XLOC_000001 - 1:6407-16373 SRR633544 SRR633545 NOTEST 1.02004 0.000000 -inf 0.000000 1.00000 1.000000 no cold cold

220 rows × 16 columns

diffexpnunique = pd.DataFrame(diffexpn.groupby(["gene_id", "treatment_1", "treatment_2"], as_index=False)["value_1","value_2","log2(fold_change)"].mean())
diffexpnunique = diffexpnunique.loc[\
                                    #(diffexpnunique["value_1"] > 0.1) & \
                                    #(diffexpnunique["value_2"] > 0.1) & \
                                    (diffexpnunique["gene_id"] != np.NaN) & \
                                    (diffexpnunique["treatment_1"] == "ck") & \
                                    (diffexpnunique["treatment_2"] == "cold"),:]

diffexpnunique = diffexpn.loc[\
                                    #(diffexpnunique["value_1"] > 0.1) & \
                                    #(diffexpnunique["value_2"] > 0.1) & \
                                    (diffexpn["gene_id"] != np.NaN) & \
                                    (diffexpn["treatment_1"] == "ck") & \
                                    (diffexpn["treatment_2"] == "cold"),:]
list(sorted(diffexpnunique["log2(fold_change)"],reverse=False))
diffexpnunique.loc[diffexpnunique["log2(fold_change)"] > 0.1, :]
diffexpnunique
test_id gene_id gene locus sample_1 sample_2 status value_1 value_2 log2(fold_change) test_stat p_value q_value significant treatment_1 treatment_2
648820 TCONS_00000001 XLOC_000001 - 1:6407-16373 SRR633516 SRR633549 OK 5.048820 5.040980 -0.002243 -0.000653 0.8598 0.989492 no ck cold
648821 TCONS_00000002 XLOC_000001 - 1:6407-16373 SRR633516 SRR633549 NOTEST 0.000000 0.000000 0.000000 0.000000 1.0000 1.000000 no ck cold
648822 TCONS_00000003 XLOC_000001 - 1:6407-16373 SRR633516 SRR633549 NOTEST 0.000000 0.000000 0.000000 0.000000 1.0000 1.000000 no ck cold
648823 TCONS_00000004 XLOC_000001 - 1:6407-16373 SRR633516 SRR633549 NOTEST 0.000000 0.000000 0.000000 0.000000 1.0000 1.000000 no ck cold
648824 TCONS_00000005 XLOC_000002 - 1:18715-23389 SRR633516 SRR633549 NOTEST 0.002775 0.000133 -4.385250 0.000000 1.0000 1.000000 no ck cold
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3244095 TCONS_00064891 XLOC_046176 - KZ116064.1:28517-32690 SRR633543 SRR633545 NOTEST 0.000000 0.000000 0.000000 0.000000 1.0000 1.000000 no ck cold
3244096 TCONS_00064892 XLOC_046177 - KZ116064.1:67456-82859 SRR633543 SRR633545 NOTEST 0.000000 0.000000 0.000000 0.000000 1.0000 1.000000 no ck cold
3244097 TCONS_00064893 XLOC_046178 - KZ116065.1:2332-20619 SRR633543 SRR633545 NOTEST 0.406668 0.536989 0.401043 0.000000 1.0000 1.000000 no ck cold
3244098 TCONS_00064894 XLOC_046179 - KZ116066.1:46423-46539 SRR633543 SRR633545 NOTEST 0.000000 0.000000 0.000000 0.000000 1.0000 1.000000 no ck cold
3244099 TCONS_00064895 XLOC_046180 - MT:1-16596 SRR633543 SRR633545 OK 1490.390000 1471.660000 -0.018246 -0.048468 0.4852 0.989492 no ck cold

1946460 rows × 16 columns

trtmt = "ck"
# Count unique genes with FPKM <= 1

lowval = 0.1
highval = 1
totalgenes = len(diffexpnunique['gene_id'][diffexpnunique["treatment_1"] == trtmt])
print(totalgenes)
lowfpkm = len(diffexpnunique['gene_id'][(diffexpnunique["treatment_1"] == trtmt) & \
                             (diffexpnunique['value_1'] > lowval) & \
                             (diffexpnunique['value_1'] <= highval)])
print("FPKM <= 1")
print(lowfpkm)
print(lowfpkm / totalgenes)

# Count unique genes with 1 < FPKM < 20

lowval = 1
highval = 20
totalgenes = len(diffexpnunique['gene_id'][diffexpnunique["treatment_1"] == trtmt])
midfpkm = len(diffexpnunique['gene_id'][(diffexpnunique["treatment_1"] == trtmt) & \
                             (diffexpnunique['value_1'] > lowval) & \
                             (diffexpnunique['value_1'] <= highval)])
print("1 < FPKM <= 20")
print(midfpkm)
print(midfpkm / totalgenes)

# Count unique genes with FPKM > 20

lowval = 20
highval = 1000000
totalgenes = len(diffexpnunique['gene_id'][diffexpnunique["treatment_1"] == trtmt])
highfpkm = len(diffexpnunique['gene_id'][(diffexpnunique["treatment_1"] == trtmt) & \
                             (diffexpnunique['value_1'] > lowval) & \
                             (diffexpnunique['value_1'] <= highval)])
print("FPKM > 20")
print(highfpkm)
print(highfpkm / totalgenes)
1946460
FPKM <= 1
248064
0.12744366696464351
1 < FPKM <= 20
865470
0.4446379581393915
FPKM > 20
126114
0.06479146758731236
## Now for cold genes
trtmt = "ck"
# Count unique genes with FPKM <= 1

lowval = -100000
highval = 1
totalgenes = len(diffexpnunique['gene_id'][diffexpnunique["treatment_1"] == trtmt])
print(totalgenes)
lowfpkm = len(diffexpnunique['gene_id'][(diffexpnunique["treatment_1"] == trtmt) & \
                             (diffexpnunique['value_2'] > lowval) & \
                             (diffexpnunique['value_2'] <= highval)])
print("FPKM <= 1")
print(lowfpkm)
print(lowfpkm / totalgenes)

# Count unique genes with 1 < FPKM < 20

lowval = 1
highval = 20
totalgenes = len(diffexpnunique['gene_id'][diffexpnunique["treatment_1"] == trtmt])
midfpkm = len(diffexpnunique['gene_id'][(diffexpnunique["treatment_1"] == trtmt) & \
                             (diffexpnunique['value_2'] > lowval) & \
                             (diffexpnunique['value_2'] <= highval)])
print("1 < FPKM <= 20")
print(midfpkm)
print(midfpkm / totalgenes)

# Count unique genes with FPKM > 20

lowval = 20
highval = 1000000
totalgenes = len(diffexpnunique['gene_id'][diffexpnunique["treatment_1"] == trtmt])
highfpkm = len(diffexpnunique['gene_id'][(diffexpnunique["treatment_1"] == trtmt) & \
                             (diffexpnunique['value_2'] > lowval) & \
                             (diffexpnunique['value_2'] <= highval)])
print("FPKM > 20")
print(highfpkm)
print(highfpkm / totalgenes)
1946460
FPKM <= 1
976945
0.501908593035562
1 < FPKM <= 20
844490
0.4338594165818974
FPKM > 20
125025
0.06423199038254061

Now let's do the same thing for the gene-level differential expression data, stored in gene_exp.diff.

diffexpngenedl = pd.read_csv("https://www.dropbox.com/s/kexe5j6d7f1oqrg/gene_exp_28Apr.diff?dl=1", sep = "\t").replace(to_replace=labels_fixed)

For this gene differential expression data, we need to include the correspondence between the accession numbers and the treatments so that we can filter out the treatments based on whether they were control or cold-treated.

ids = "SRR633516 SRR633540 SRR633541 SRR633542 SRR633543 SRR633549 SRR633548 SRR633547 SRR633546 SRR633544 SRR633545".split(" ")
treatments = ["ck","ck","ck","ck","ck","cold","cold","cold","cold","cold","cold"]

treatments_fixed = dict(zip(ids,treatments))
diffexpngene = diffexpngenedl.copy(deep=True)
diffexpngene["treatment_1"] = diffexpngene["sample_1"].replace(to_replace=treatments_fixed)
diffexpngene["treatment_2"] = diffexpngene["sample_2"].replace(to_replace=treatments_fixed)

diffexpngenefilt = diffexpngene
diffexpngenefilt
test_id gene_id gene locus sample_1 sample_2 status value_1 value_2 log2(fold_change) test_stat p_value q_value significant treatment_1 treatment_2
0 TCONS_00000001 XLOC_000001 - 1:6407-16373 SRR633516 SRR633540 OK 5.048820 3.691990 -0.451547 -0.117276 0.83120 0.989492 no ck ck
1 TCONS_00000002 XLOC_000001 - 1:6407-16373 SRR633516 SRR633540 NOTEST 0.000000 0.631836 inf 0.000000 1.00000 1.000000 no ck ck
2 TCONS_00000003 XLOC_000001 - 1:6407-16373 SRR633516 SRR633540 NOTEST 0.000000 0.000000 0.000000 0.000000 1.00000 1.000000 no ck ck
3 TCONS_00000004 XLOC_000001 - 1:6407-16373 SRR633516 SRR633540 NOTEST 0.000000 0.483925 inf 0.000000 1.00000 1.000000 no ck ck
4 TCONS_00000005 XLOC_000002 - 1:18715-23389 SRR633516 SRR633540 NOTEST 0.002775 0.003065 0.143296 0.000000 1.00000 1.000000 no ck ck
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3568505 TCONS_00064891 XLOC_046176 - KZ116064.1:28517-32690 SRR633544 SRR633545 NOTEST 0.000000 0.000000 0.000000 0.000000 1.00000 1.000000 no cold cold
3568506 TCONS_00064892 XLOC_046177 - KZ116064.1:67456-82859 SRR633544 SRR633545 NOTEST 0.000000 0.000000 0.000000 0.000000 1.00000 1.000000 no cold cold
3568507 TCONS_00064893 XLOC_046178 - KZ116065.1:2332-20619 SRR633544 SRR633545 NOTEST 0.549827 0.536989 -0.034086 0.000000 1.00000 1.000000 no cold cold
3568508 TCONS_00064894 XLOC_046179 - KZ116066.1:46423-46539 SRR633544 SRR633545 NOTEST 0.000000 0.000000 0.000000 0.000000 1.00000 1.000000 no cold cold
3568509 TCONS_00064895 XLOC_046180 - MT:1-16596 SRR633544 SRR633545 OK 1499.220000 1471.660000 -0.026770 -0.071112 0.47885 0.989492 no cold cold

3568510 rows × 16 columns

diffexpngenefilt = pd.DataFrame(diffexpngenefilt.groupby(["gene_id", "treatment_1", "treatment_2"], as_index=False)["value_1","value_2","log2(fold_change)"].mean())

diffexpngenefilt = diffexpngenefilt.loc[\
                              #(diffexpngene["value_1"] > 0.1) & \
                              #(diffexpngene["value_2"] > 0.1) & \
                              ((diffexpngenefilt["value_1"] > 0.1) | (diffexpngenefilt["value_2"] > 0.1)) & \
                              #(diffexpngene["status"] == "OK") & \
                              (diffexpngenefilt["gene_id"] != np.NaN) & \
                              (((diffexpngenefilt["treatment_1"] == "ck") & (diffexpngenefilt["treatment_2"] == "cold")) | \
                              ((diffexpngenefilt["treatment_2"] == "ck") & (diffexpngenefilt["treatment_1"] == "cold"))),:]
diffexpngenefilt
len(diffexpngenefilt["gene_id"].unique())
35927
#diffexpngenefilt = pd.DataFrame(diffexpngenefilt.groupby(["gene_id", "treatment_1", "treatment_2"], as_index=False)["value_1","value_2","log2(fold_change)"].mean())
diffexpngenefilt
gene_id treatment_1 treatment_2 value_1 value_2 log2(fold_change)
1 XLOC_000001 ck cold 1.164626 1.469764 NaN
4 XLOC_000002 ck cold 3.192895 3.855536 0.898441
10 XLOC_000004 ck cold 7.284566 6.563303 inf
13 XLOC_000005 ck cold 7.095824 3.641068 NaN
16 XLOC_000006 ck cold 1.905134 1.286479 -0.584266
... ... ... ... ... ... ...
138514 XLOC_046172 ck cold 0.532399 1.679405 1.666810
138517 XLOC_046173 ck cold 0.557760 0.797891 NaN
138523 XLOC_046175 ck cold 136.701700 160.303833 0.403002
138532 XLOC_046178 ck cold 0.291451 0.510556 0.928998
138538 XLOC_046180 ck cold 1468.348000 1504.838333 0.033561

35927 rows × 6 columns

trtmt = "ck"
# Count unique genes with FPKM <= 1

lowval = 0.1
highval = 1
totalgenes = len(diffexpngenefilt['gene_id'][diffexpngenefilt["treatment_1"] == trtmt])
print(totalgenes)
lowfpkm = len(diffexpngenefilt['gene_id'][(diffexpngenefilt["treatment_1"] == trtmt) & \
                             (diffexpngenefilt['value_1'] > lowval) & \
                             (diffexpngenefilt['value_1'] <= highval)])
print("FPKM <= 1")
print(lowfpkm)
print(lowfpkm / totalgenes)

# Count unique genes with 1 < FPKM < 20

lowval = 1
highval = 20
totalgenes = len(diffexpngenefilt['gene_id'][diffexpngenefilt["treatment_1"] == trtmt])
midfpkm = len(diffexpngenefilt['gene_id'][(diffexpngenefilt["treatment_1"] == trtmt) & \
                             (diffexpngenefilt['value_1'] > lowval) & \
                             (diffexpngenefilt['value_1'] <= highval)])
print("1 < FPKM <= 20")
print(midfpkm)
print(midfpkm / totalgenes)

# Count unique genes with FPKM > 20

lowval = 20
highval = 1000000
totalgenes = len(diffexpngenefilt['gene_id'][diffexpngenefilt["treatment_1"] == trtmt])
highfpkm = len(diffexpngenefilt['gene_id'][(diffexpngenefilt["treatment_1"] == trtmt) & \
                             (diffexpngenefilt['value_1'] > lowval) & \
                             (diffexpngenefilt['value_1'] <= highval)])
print("FPKM > 20")
print(highfpkm)
print(highfpkm / totalgenes)
35927
FPKM <= 1
6221
0.17315667882094246
1 < FPKM <= 20
25592
0.7123333426114065
FPKM > 20
3622
0.10081554262810699
trtmt = "cold"
# Count unique genes with FPKM <= 1

lowval = 0.1
highval = 1
totalgenes = len(diffexpngenefilt['gene_id'][diffexpngenefilt["treatment_2"] == trtmt])
print(totalgenes)
lowfpkm = len(diffexpngenefilt['gene_id'][(diffexpngenefilt["treatment_2"] == trtmt) & \
                             (diffexpngenefilt['value_2'] > lowval) & \
                             (diffexpngenefilt['value_2'] <= highval)])
print("FPKM <= 1")
print(lowfpkm)
print(lowfpkm / totalgenes)

# Count unique genes with 1 < FPKM < 20

lowval = 1
highval = 20
totalgenes = len(diffexpngenefilt['gene_id'][diffexpngenefilt["treatment_2"] == trtmt])
midfpkm = len(diffexpngenefilt['gene_id'][(diffexpngenefilt["treatment_2"] == trtmt) & \
                             (diffexpngenefilt['value_2'] > lowval) & \
                             (diffexpngenefilt['value_2'] <= highval)])
print("1 < FPKM <= 20")
print(midfpkm)
print(midfpkm / totalgenes)

# Count unique genes with FPKM > 20

lowval = 20
highval = 1000000
totalgenes = len(diffexpngenefilt['gene_id'][diffexpngenefilt["treatment_2"] == trtmt])
highfpkm = len(diffexpngenefilt['gene_id'][(diffexpngenefilt["treatment_2"] == trtmt) & \
                             (diffexpngenefilt['value_2'] > lowval) & \
                             (diffexpngenefilt['value_2'] <= highval)])
print("FPKM > 20")
print(highfpkm)
print(highfpkm / totalgenes)
35927
FPKM <= 1
6418
0.17864002004063795
1 < FPKM <= 20
25074
0.6979152169677402
FPKM > 20
3489
0.09711359144932781

These values are in the neighborhood of what we expected, but not quite right. For now, we'll run with them. We had issues with versioning of several softwares as well as potentially with our analysis itself. Our best bet is to just merge our results with the authors' files downstream.