In [1]:
import pandas as pd
import numpy as np
import glob
import os
import scipy.stats
import matplotlib.pyplot as plt
import matplotlib
import utils
In [2]:
# Raw fitness-assay spreadsheet; every later cell derives its tables from it.
data = pd.read_excel("raw/24-04-03/Lifecycle fitness tests.xlsx")
from utils import COLORS

# Output locations: figures/tables go to EXPORT_PATH, the HTML table to WWW_PATH.
EXPORT_PATH = '06_fitness_assays'
WWW_PATH = "www"

# exist_ok=True makes the cell re-runnable without a separate (racy)
# "does it exist?" check before creating the directory.
os.makedirs(EXPORT_PATH, exist_ok=True)
os.makedirs(WWW_PATH, exist_ok=True)
In [3]:
# Display labels for strains that are looked up verbatim rather than parsed.
names = {
    "Line-17": r"LCS+ ancestor",
    "Line-17 mutS": "LCS- ancestor",
    "Line-17 mutS mutL*": "LCS- ancestor $mutL$",
    "SBW25": "SBW25",
    "SBW25 mutS*": "SBW25 $mutS^*$",
    "SBW25 mutL*": "SBW25 $mutL*$",
}


def convert_PE(pe):
    """Translate a raw strain identifier into its display label.

    Identifiers listed in ``names`` are returned through that table.
    Otherwise the substrings ``mutL`` and ``wssE`` are rewritten to their
    ``$...^{WT}$`` form and, if the result contains ``PE``, it is parsed as
    ``PE-<cycle>-<exp>-<microcosm>`` (underscores count as dashes).

    Returns
    -------
    str or None
        ``"<cycle + 1>-I-<microcosm>"`` for ``PE`` identifiers (the ``<exp>``
        field is not used), the ``names`` entry for known strains, and
        ``None`` for anything else.

    Raises
    ------
    ValueError
        If a ``PE`` identifier does not split into exactly four dash-separated
        fields, or its cycle field is not an integer.
    """
    try:
        return names[pe]
    except KeyError:
        pass

    label = pe.replace("mutL", "$mutL^{WT}$").replace("wssE", "$wssE^{WT}$")
    if 'PE' in label:
        _prefix, cycle_field, _exp, well = label.replace('_', '-').split('-')
        # Labels are numbered one above the cycle written in the identifier.
        return f"{int(cycle_field) + 1}-I-{well}"
    return None
# Display label and treatment group for every raw record.
data["microcosm"] = [convert_PE(x) for x in data.Strain]
# Non-"PE" strains are ancestors; among "PE" strains, "WT" in the identifier
# marks the LCS- treatment and everything else is LCS+.
data["treatment"] = ["ancestor" if "PE" not in x else "LCS-" if "WT" in x else "LCS+" for x in data.Strain]

# Placeholder strings used for missing measurements become NaN in one pass.
# Rebinding (instead of inplace=True) keeps the cell's lineage explicit, and the
# explicit infer_objects() re-derives numeric dtypes for the cleaned columns
# rather than relying on replace()'s deprecated silent downcasting.
data = data.replace(['na', 'Na', '9/37'], np.nan).infer_objects()
data.head()
/tmp/ipykernel_348611/246941747.py:25: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)` data.replace('na',np.nan, inplace=True) /tmp/ipykernel_348611/246941747.py:26: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)` data.replace('Na',np.nan, inplace=True) /tmp/ipykernel_348611/246941747.py:27: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)` data.replace('9/37',np.nan, inplace=True)
Out[3]:
Date | Strain | Size | Rack | Tube | Phase | Mat | Count WS | Count SM | Count FZ | Extinct | Replaced by | Condition | microcosm | treatment | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-11-15 | Line-17 mutS | small | I | 1 | 1 | 1.0 | 100.0 | 0.0 | 0 | 1.0 | I6 | 0 | LCS- ancestor | ancestor |
1 | 2016-11-15 | Line-17 mutS | small | I | 2 | 1 | 1.0 | 100.0 | 0.0 | 0 | 1.0 | I6 | 0 | LCS- ancestor | ancestor |
2 | 2016-11-15 | Line-17 mutS | small | I | 3 | 1 | 1.0 | 108.0 | 0.0 | 0 | 1.0 | I5 | 0 | LCS- ancestor | ancestor |
3 | 2016-11-15 | Line-17 mutS | small | I | 4 | 1 | 1.0 | 116.0 | 0.0 | 0 | 1.0 | I5 | 0 | LCS- ancestor | ancestor |
4 | 2016-11-15 | Line-17 mutS | small | I | 5 | 1 | 1.0 | 40.0 | 11.0 | 0 | 0.0 | 0 | 0 | LCS- ancestor | ancestor |
In [4]:
# Strain identifiers that get mutL=True in the per-strain summary table.
# Membership is tested with an exact string match, so the spelling here must be
# kept verbatim -- including the trailing space in 'PE-4-WT-E3 '.
mutL = [
    'Line-17 mutS mutL*',
    'PE-10-17-A2',
    'PE-10-17-C2',
    'PE-10-17-C4',
    'PE-10-17-C5',
    'PE-10-17-D7',
    'PE-10-17-D8',
    'PE-10-17-E8',
    'PE-10-17-F3',
    'PE-10-17-F6',
    'PE-10-WT-A6',
    'PE-10-WT-B2',
    'PE-10-WT-B5',
    'PE-10-WT-C6',
    'PE-10-WT-E1',
    'PE-10-WT-E7',
    'PE-10-WT-F2',
    'PE-2-WT-A1',
    'PE-2-WT-A5',
    'PE-3-WT-E2',
    'PE-4-WT-E3 ',
    'PE-4-WT-E3 wssE',
    'PE-5-WT-A5',
    'PE-5-WT-C3',
    'PE-5-WT-E3',
    'SBW25 mutL*',
]
In [5]:
# Sanity check: every non-missing value in the tally columns must be
# convertible to int. Offending cells are printed (error, row index, column,
# value, strain) instead of stopping the notebook.
cols = ["Extinct", "Mat", "Count WS", "Count SM"]
for row_index, record in data.iterrows():
    for column in cols:
        try:
            if np.isnan(record[column]):
                continue
            int(record[column])
        except Exception as ex:
            print(ex, row_index, column, record[column], record["Strain"])
In [6]:
# Per-strain summary: one row per (Strain, Size, treatment) group of the raw
# records, with per-phase tallies stored under "<measure>_phase<Phase>" keys.
records = []
for (strain, size, treatment), group in data.groupby(['Strain', "Size", "treatment"]):
    record = {
        "strain": strain,
        "size": "L" if size.lower() == "large" else "S",
        "mutL": strain in mutL,
        "treatment": treatment,
        "microcosm": group.microcosm.unique()[0],
    }
    for phase, rows in group.groupby("Phase"):
        record.update({
            # n counts every record of the phase, including rows with missing tallies.
            f"n_phase{phase}": rows.shape[0],
            f"extinct_phase{phase}": int((rows.Extinct.dropna() > 0).sum()),
            f"mat_phase{phase}": int((rows.Mat.dropna() > 0).sum()),
            # "Count WS" is cast to float explicitly before the comparison.
            f"WS_phase{phase}": int((np.float64(rows["Count WS"].dropna()) > 0).sum()),
            f"SM_phase{phase}": int((rows["Count SM"].dropna() > 0).sum()),
        })
    records.append(record)
strains = pd.DataFrame(records)

# Observed success proportions per stage, and their product as the
# probability of completing the whole life cycle.
strains["proba_mat_p1"] = strains["mat_phase1"].div(strains["n_phase1"])
strains["proba_SM_p1"] = strains["SM_phase1"].div(strains["n_phase1"])
strains["proba_WS_p2"] = strains["WS_phase2"].div(strains["n_phase2"])
strains["proba_lifecycle"] = strains["proba_mat_p1"].mul(strains["proba_SM_p1"]).mul(strains["proba_WS_p2"])
strains.head()
Out[6]:
strain | size | mutL | treatment | microcosm | n_phase1 | extinct_phase1 | mat_phase1 | WS_phase1 | SM_phase1 | n_phase2 | extinct_phase2 | mat_phase2 | WS_phase2 | SM_phase2 | proba_mat_p1 | proba_SM_p1 | proba_WS_p2 | proba_lifecycle | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Line-17 | L | False | ancestor | LCS+ ancestor | 72 | 50 | 24 | 69 | 70 | 72 | 0 | 31 | 63 | 63 | 0.333333 | 0.972222 | 0.875000 | 0.283565 |
1 | Line-17 | S | False | ancestor | LCS+ ancestor | 72 | 4 | 69 | 72 | 71 | 72 | 0 | 58 | 71 | 71 | 0.958333 | 0.986111 | 0.986111 | 0.931898 |
2 | Line-17 mutS | L | False | ancestor | LCS- ancestor | 184 | 156 | 51 | 179 | 158 | 138 | 25 | 32 | 112 | 136 | 0.277174 | 0.858696 | 0.811594 | 0.193166 |
3 | Line-17 mutS | S | False | ancestor | LCS- ancestor | 88 | 51 | 60 | 88 | 65 | 88 | 13 | 40 | 69 | 72 | 0.681818 | 0.738636 | 0.784091 | 0.394880 |
4 | Line-17 mutS mutL* | S | True | ancestor | LCS- ancestor $mutL$ | 48 | 11 | 35 | 45 | 46 | 37 | 0 | 18 | 35 | 35 | 0.729167 | 0.958333 | 0.945946 | 0.661013 |
In [7]:
import sqlite3
from contextlib import closing

# Store the summary table in the project database, replacing any earlier copy.
# A sqlite3 connection used as a context manager only commits or rolls back the
# transaction -- it does not close the connection -- so closing() is what
# actually releases the database file when the cell finishes.
with closing(sqlite3.connect("lce_data.sqlite")) as database:
    with database:
        strains.to_sql("fitness_assays", database, if_exists="replace", index=False)
In [8]:
# CSV copy of the summary table (the DataFrame index is written as first column).
strains.to_csv(os.path.join(EXPORT_PATH, "fitness_assays.csv"))

border = [{'selector': 'th', 'props': 'border-left: 1px solid black'},
          {'selector': 'td', 'props': 'border-left: 1px solid black'}]
proba_columns = [x for x in strains.columns if "proba" in x]

# NOTE(review): the ('ID', 'name') key does not match any column of `strains`
# shown in this notebook, so this style rule presumably selects nothing --
# confirm whether it is a leftover and can be dropped.
html = (
    strains.style
    .set_table_styles({('ID', 'name'): border}, overwrite=False, axis=0)
    .set_sticky(axis="index")
    .set_sticky(axis="columns")
    # In-cell bars for every probability column.
    .bar(subset=proba_columns, color=matplotlib.colors.to_hex('C0'))
    .to_html()
)

# The page declares charset=utf-8, so the file must be written as UTF-8
# regardless of the platform's default encoding.
with open(os.path.join(WWW_PATH, "fitness_assays.html"), 'w', encoding='utf-8') as f:
    f.write("<html><head><meta charset=\"utf-8\"><title>Fitness Assays Data</title><style> table, th, td {border: 1px solid} table{border-collapse: collapse;} thead{background-color:white;} </style></head><body>")
    f.write(html)
    # Close the document properly (the <html> element was left open before).
    f.write("</body></html>")
In [9]:
# Panels drawn by the plotting helpers: [count column of `strains`, panel title].
plots = [
    ["mat_phase1", "Probability to avoid soma failure\n during Ph. I"],
    ["SM_phase1", "Probability to produce germ cells\n during Ph. I"],
    ["WS_phase2", "Probability to produce soma cells\n during Ph. II"],
]
In [10]:
def plotproba(col, df, title, ax,number=True, col_color=None):
    """Draw one panel of success proportions, one horizontal line per row of ``df``.

    For every row, ``k = row[col]`` successes out of ``n = row["n_phase<p>"]``
    trials are shown as:

    * a dot at the observed proportion ``k / n``, annotated with its value
      and with ``k/n``;
    * a translucent bar spanning the central 50% interval and a thick line
      spanning the central 95% interval of a Beta(1 + k, 1 + n - k)
      distribution.

    Parameters
    ----------
    col : str
        Name of the count column. If it contains ``"phase1"`` the trial count
        is read from ``n_phase1``, otherwise from ``n_phase2``.
    df : pandas.DataFrame
        Must provide ``col``, the matching ``n_phase*`` column, ``microcosm``
        (used as y tick label) and, if given, ``col_color``.
    title : str
        Panel title (drawn in bold).
    ax : matplotlib.axes.Axes
        Axes to draw on.
    number : bool, optional
        Accepted for backward compatibility; it has no effect on the drawing.
    col_color : str, optional
        Name of a column whose truthy rows are drawn in ``'C1'`` instead of
        ``'C0'``.
    """
    # The phase named in `col` selects the matching trial-count column.
    phase = '1' if 'phase1' in col else '2'
    labels = []
    yticks = []
    for u, (_, row) in enumerate(df.iterrows(), start=1):
        c = 'C1' if (col_color and row[col_color]) else 'C0'
        successes = row[col]
        trials = row[f"n_phase{phase}"]
        posterior = scipy.stats.beta(1 + successes, 1 + trials - successes)
        low50, high50 = posterior.interval(0.50)
        low95, high95 = posterior.interval(0.95)
        proba = successes / trials

        labels.append(f"{row.microcosm.strip():>30}")
        yticks.append(u)

        ax.scatter(proba, u, color=c)
        ax.barh(u, high50 - low50, left=low50, alpha=0.5, color=c)
        ax.hlines(u, xmin=low95, xmax=high95, color=c, lw=4)
        ax.text(proba, u + 0.4, f"{proba:03.2f}", ha='center', va='top')
        # The "k/n" annotation is best-effort: rows whose counts cannot be
        # shown as integers (e.g. missing values) are simply left without it.
        try:
            count_label = f"{int(successes)}/{int(trials)}"
        except (TypeError, ValueError, OverflowError):
            count_label = None
        if count_label is not None:
            ax.text(proba, u - 0.4, count_label, ha='center')

    # Faint guide line through every row.
    for u in yticks:
        ax.axhline(u, color='k', ls=":", alpha=0.5)
    ax.set(yticks=yticks, yticklabels=labels,
           xlim=(0,1))
    ax.set_title(title, font={'weight':'bold'})
def plotproba2(col, df, title, ax, col_color=None):
    """Draw a dot plot of an already-computed probability column.

    Each row of ``df`` becomes one dot at ``x = row[col]`` on its own
    horizontal guide line, annotated with the value to two decimals.

    Parameters
    ----------
    col : str
        Column of ``df`` holding the probability to plot.
    df : pandas.DataFrame
        One row per plotted entry.
    title : str
        Panel title (drawn in bold).
    ax : matplotlib.axes.Axes
        Axes to draw on.
    col_color : str, optional
        Name of a column whose truthy rows are drawn in ``'C1'`` instead of
        ``'C0'``.

    Notes
    -----
    The y ticks are set with an empty label list, so this function does not
    label the rows itself. NOTE(review): the figures in this notebook create
    their axes with ``sharey=True``, which presumably lets another panel
    supply the labels -- confirm before using this on a standalone axis.
    """
    positions = []
    for position, (_, record) in enumerate(df.iterrows(), start=1):
        colour = 'C1' if (col_color and record[col_color]) else 'C0'
        value = record[col]
        positions.append(position)
        ax.scatter(value, position, color=colour)
        ax.text(value, position + 0.4, f"{value:03.2f}", ha='center', va='top')

    for position in positions:
        ax.axhline(position, color='k', ls=":", alpha=0.5)
    ax.set(yticks=positions, yticklabels=[], xlim=(0, 1))
    ax.set_title(title, font={'weight': 'bold'})
def plot(df, size=(20,18)):
    """Build the 2x2 overview figure for the strains in ``df``.

    The bottom-right panel shows ``proba_lifecycle``; the entries of the
    module-level ``plots`` list fill the remaining panels in row-major order.
    All panels share the y axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of the per-strain summary table to display.
    size : tuple, optional
        Figure size handed to ``plt.subplots``.

    Returns
    -------
    (fig, ax)
        The figure and its 2x2 array of axes.
    """
    fig, ax = plt.subplots(2, 2, figsize=size, layout='constrained', sharey=True)
    plotproba2("proba_lifecycle", df, "Probability to complete the life cycle", ax[1, 1])
    for position, spec in enumerate(plots):
        column, heading = spec
        plotproba(column, df, heading, ax.flat[position])
    return fig, ax
In [11]:
# Flag the lines whose label carries cycle number 11. Labels produced by
# convert_PE have the form "<cycle>-I-<microcosm>", so the cycle is tested as a
# prefix: a plain substring test would also match any label that merely
# contains "11" somewhere else (e.g. in the microcosm name).
strains["endpoint"] = [x.startswith("11-") for x in strains.microcosm]
In [12]:
# Leave out every strain whose label mentions a wss or mut allele.
st = strains[[('wss' not in x and 'mut' not in x) for x in strains.microcosm]]

# One 2x2 figure per treatment: the endpoint lines of that treatment (best
# life-cycle probability first) followed by the matching ancestor.
# NOTE(review): a later cell saves its wide figures under the same
# "proba_<treatment>" file names, so these files are overwritten on a full run.
for t in utils.TREATMENTS:
    size, genotype = t.split("-",1)
    endpoint_lines = st.query(
        f"size=='{size}' and treatment=='{genotype}' and endpoint==True"
    ).sort_values('proba_lifecycle', ascending=False)
    ancestor = st.query(f"size=='{size}' and microcosm=='{genotype} ancestor'")
    # A dedicated name is used here so the raw `data` frame loaded at the top
    # of the notebook is not overwritten (earlier cells stay re-runnable).
    treatment_summary = pd.concat([endpoint_lines, ancestor])
    fig,ax = plot(treatment_summary,(10,9) )
    fig.suptitle(t)
    print(size, genotype)
    fig.savefig(os.path.join(EXPORT_PATH, f"proba_{t}.pdf"),bbox_inches='tight')
    fig.savefig(os.path.join(EXPORT_PATH, f"proba_{t}.png"),bbox_inches='tight')
    plt.show()
S LCS+
L LCS+
S LCS-
L LCS-
In [13]:
# Overview of every strain assayed in large microcosms, ordered by
# life-cycle probability.
large_microcosms = strains[strains['size'] == 'L']
fig, ax = plot(large_microcosms.sort_values('proba_lifecycle'))
fig.suptitle("Large microcosms")
plt.savefig(os.path.join(EXPORT_PATH, "proba_large.pdf"), bbox_inches='tight')
In [14]:
# Overview of every strain assayed in small microcosms, ordered by
# life-cycle probability.
small_microcosms = strains[strains['size'] == 'S']
fig, ax = plot(small_microcosms.sort_values('proba_lifecycle'))
fig.suptitle("Small microcosms")
plt.savefig(os.path.join(EXPORT_PATH, "proba_small.pdf"), bbox_inches='tight')
In [15]:
# Hand-picked strains for the 2x2 figure. The list is written in reading order
# and reversed before use; identifiers must match the `strain` column exactly
# (note the trailing space in 'PE-4-WT-E3 ').
# NOTE(review): a later cell saves its figure to the same "proba.pdf" path.
to_disp = list(reversed([
    'Line-17 mutS',
    'PE-2-WT-A5',
    'PE-2-WT-A5 mutL',
    'PE-3-WT-E2',
    'PE-4-WT-E3 ',
    'PE-5-WT-E3',
]))
large_by_strain = strains[strains['size'] == 'L'].set_index('strain')
fig, ax = plot(large_by_strain.loc[to_disp, :], (10, 8))
plt.savefig(os.path.join(EXPORT_PATH, "proba.pdf"), bbox_inches='tight')
In [16]:
def plot_wide(df, size=(14,5), color=None):
    """Build the single-row, four-panel figure for the strains in ``df``.

    Panel "a" shows ``proba_lifecycle``; the entries of the module-level
    ``plots`` list follow as panels "b", "c" and "d". All panels share the
    y axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of the per-strain summary table to display.
    size : tuple, optional
        Figure size handed to ``plt.subplots``.
    color : str, optional
        Column name forwarded as ``col_color`` to the panel helpers.

    Returns
    -------
    (fig, ax)
        The figure and its array of four axes.
    """
    fig, ax = plt.subplots(1, 4, figsize=size, layout='constrained', sharey=True)
    plotproba2("proba_lifecycle", df, "a. Probability to complete\n the life cycle", ax[0], col_color=color)
    panel_letters = "bcd"
    for position, spec in enumerate(plots):
        column, heading = spec
        lettered_title = panel_letters[position] + ". " + heading
        plotproba(column, df, lettered_title, ax[position + 1], col_color=color)
    return fig, ax
In [17]:
# Leave out every strain whose label mentions a wss or mut allele.
st = strains[[('wss' not in x and 'mut' not in x) for x in strains.microcosm]]

# One wide (1x4) figure per treatment: the endpoint lines of that treatment
# (best life-cycle probability first) followed by the matching ancestor.
# NOTE(review): an earlier cell saves its 2x2 figures under the same
# "proba_<treatment>" file names; the files written here replace them.
for t in utils.TREATMENTS:
    size, genotype = t.split("-",1)
    endpoint_lines = st.query(
        f"size=='{size}' and treatment=='{genotype}' and endpoint==True"
    ).sort_values('proba_lifecycle', ascending=False)
    ancestor = st.query(f"size=='{size}' and microcosm=='{genotype} ancestor'")
    # A dedicated name is used here so the raw `data` frame loaded at the top
    # of the notebook is not overwritten (earlier cells stay re-runnable).
    treatment_summary = pd.concat([endpoint_lines, ancestor])
    fig,ax = plot_wide(treatment_summary)
    fig.suptitle(t, fontweight='bold',fontsize=20)
    print(size, genotype)
    fig.savefig(os.path.join(EXPORT_PATH, f"proba_{t}.pdf"),bbox_inches='tight')
    fig.savefig(os.path.join(EXPORT_PATH, f"proba_{t}.png"),bbox_inches='tight')
    plt.show()
S LCS+
L LCS+
S LCS-
L LCS-
In [18]:
# Hand-picked strains for the wide figure, coloured by their mutL flag. The
# list is written in reading order and reversed before use; identifiers must
# match the `strain` column exactly (note the trailing space in 'PE-4-WT-E3 ').
# NOTE(review): an earlier cell also saves to "proba.pdf"; this figure replaces it.
to_disp = list(reversed([
    'Line-17 mutS',
    'PE-2-WT-A5',
    'PE-2-WT-A5 mutL',
    'PE-3-WT-E2',
    'PE-4-WT-E3 ',
    'PE-5-WT-E3',
]))
large_by_strain = strains[strains['size'] == 'L'].set_index('strain')
fig, ax = plot_wide(large_by_strain.loc[to_disp, :], color='mutL')
plt.savefig(os.path.join(EXPORT_PATH, "proba.pdf"), bbox_inches='tight')