import pandas as pd
import numpy as np
import glob 
import os 
import scipy.stats
import matplotlib.pyplot as plt
import matplotlib
import utils

# Raw assay observations read from the lab spreadsheet. The columns used
# further down are Strain, Size, Phase, Mat, "Count WS", "Count SM", Extinct.
data = pd.read_excel("raw/24-04-03/Lifecycle fitness tests.xlsx")

from utils import COLORS
EXPORT_PATH = '06_fitness_assays'
WWW_PATH = "www"
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
# check-then-create race and also works if a path gains parent directories.
os.makedirs(EXPORT_PATH, exist_ok=True)
os.makedirs(WWW_PATH, exist_ok=True)

# Display labels for the reference strains (keys are the raw spreadsheet
# names). Values are matplotlib mathtext where they contain $...$.
names = {
    "Line-17": r"LCS+ ancestor",
    "Line-17 mutS": "LCS- ancestor",
    "Line-17 mutS mutL*": "LCS- ancestor $mutL$",
    "SBW25": "SBW25",
    "SBW25 mutS*": "SBW25 $mutS^*$",
    "SBW25 mutL*": "SBW25 $mutL*$",
}


def convert_PE(pe):
    """Translate a raw strain name into the label used in tables and figures.

    Reference strains listed in ``names`` are looked up directly. Any other
    name has "mutL" / "wssE" rewritten to their mathtext ``^{WT}`` form and,
    if it contains "PE", is parsed as ``PE-<cycle>-<exp>-<microcosm>``
    (underscores are accepted as separators) and rendered as
    ``"<cycle + 1>-I-<microcosm>"``.

    Returns ``None`` for a name that is neither a reference strain nor a PE
    name. Raises ``ValueError`` if a PE name does not split into exactly four
    fields or its cycle field is not an integer.
    """
    try:
        return names[pe]
    except KeyError:
        pass

    tagged = pe.replace("mutL", "$mutL^{WT}$").replace("wssE", "$wssE^{WT}$")
    if 'PE' not in tagged:
        return None

    # The experiment field is parsed but not part of the label.
    _prefix, cycle, _exp, microcosm = tagged.replace('_', '-').split('-')
    return f"{int(cycle) + 1}-I-{microcosm}"

# Attach the display label and the treatment class to every observation.
# Treatment is derived from the raw strain name: anything without "PE" is an
# ancestor; PE lines with "WT" in the name are LCS-, the remaining ones LCS+.
data["microcosm"] = [convert_PE(x) for x in data.Strain]
data["treatment"] = ["ancestor" if "PE" not in x else "LCS-" if "WT" in x else "LCS+" for x in data.Strain]

# Turn the textual placeholders into real missing values in a single pass.
# The explicit infer_objects() restores numeric dtypes on columns that no
# longer hold strings; replace() used to do this implicitly, but that silent
# downcasting is deprecated (FutureWarning) and goes away in a later pandas.
# NOTE(review): '9/37' looks like a one-off data-entry artefact - confirm.
data = data.replace(["na", "Na", "9/37"], np.nan).infer_objects()
data.head()

/tmp/ipykernel_348611/246941747.py:25: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  data.replace('na',np.nan, inplace=True)
/tmp/ipykernel_348611/246941747.py:26: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  data.replace('Na',np.nan, inplace=True)
/tmp/ipykernel_348611/246941747.py:27: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  data.replace('9/37',np.nan, inplace=True)

# Raw strain names flagged in the `mutL` column of the summary table
# (membership is tested with `strain in mutL`). Note that 'PE-4-WT-E3 '
# carries a trailing space, matching the spelling tested against Strain.
mutL = (
    ['Line-17 mutS mutL*']
    + [f'PE-10-17-{well}'
       for well in ('A2', 'C2', 'C4', 'C5', 'D7', 'D8', 'E8', 'F3', 'F6')]
    + [f'PE-10-WT-{well}'
       for well in ('A6', 'B2', 'B5', 'C6', 'E1', 'E7', 'F2')]
    + [
        'PE-2-WT-A1',
        'PE-2-WT-A5',
        'PE-3-WT-E2',
        'PE-4-WT-E3 ',
        'PE-4-WT-E3 wssE',
        'PE-5-WT-A5',
        'PE-5-WT-C3',
        'PE-5-WT-E3',
        'SBW25 mutL*',
    ]
)

# Sanity check: report every non-missing cell of the count/flag columns that
# cannot be read as an integer, together with its row index and strain.
cols = ["Extinct", "Mat", "Count WS", "Count SM"]
for row_index, record in data.iterrows():
    for column in cols:
        try:
            value = record[column]
            if np.isnan(value):
                continue
            int(value)
        except Exception as ex:
            print(ex, row_index, column, record[column], record["Strain"])

# Build one summary record per (Strain, Size, treatment) group of raw rows.
strains = []
for (strain,size,treatment), df in data.groupby(['Strain',"Size", "treatment"]):
    strains.append({"strain": strain, 
                    # Size is collapsed to "L" when it lower-cases to "large", else "S".
                    "size": "L" if size.lower()=="large" else "S",
                    "mutL": strain in mutL,
                    "treatment": treatment,
                    # First label seen in the group; presumably the only one,
                    # since the label is derived from Strain - confirm.
                    "microcosm": df.microcosm.unique()[0]})
    # Per-phase tallies. The key names embed the Phase value (n_phase1,
    # mat_phase1, ...), which the probability columns below rely on.
    for p, d in df.groupby("Phase"):
        # Number of raw rows recorded for this phase.
        strains[-1][f"n_phase{p}"] = d.shape[0]
        # Each count below is the number of non-missing cells that are > 0.
        strains[-1][f"extinct_phase{p}"] = int((d.Extinct.dropna()>0).sum())
        strains[-1][f"mat_phase{p}"] = int((d.Mat.dropna()>0).sum())
        # "Count WS" is cast through np.float64 before the comparison;
        # presumably the column can hold non-float objects - TODO confirm.
        strains[-1][f"WS_phase{p}"] = int((np.float64(d["Count WS"].dropna())>0).sum())
        strains[-1][f"SM_phase{p}"] = int((d["Count SM"].dropna()>0).sum())

strains = pd.DataFrame(strains)
# Observed success fractions: positive rows over all rows of the phase.
strains["proba_mat_p1"] = strains.mat_phase1/strains.n_phase1
strains["proba_SM_p1"] = strains.SM_phase1/strains.n_phase1
strains["proba_WS_p2"] = strains.WS_phase2/strains.n_phase2
# Product of the three step fractions.
# NOTE(review): multiplying treats the steps as independent - confirm intent.
strains["proba_lifecycle"] = strains.proba_mat_p1 * strains.proba_SM_p1 * strains.proba_WS_p2 
# Notebook display of the first rows; has no effect when run as a script.
strains.head()

import sqlite3
from contextlib import closing

# Persist the summary table. `with connection:` only commits or rolls back;
# it does not close the connection, so closing() is wrapped around it to
# release the database file once the table has been written.
with closing(sqlite3.connect("lce_data.sqlite")) as database:
    with database:
        strains.to_sql("fitness_assays", database, if_exists="replace", index=False)

# NOTE(review): the CSV keeps the DataFrame index while the SQL table drops
# it (index=False) - confirm whether that difference is intended.
strains.to_csv(os.path.join(EXPORT_PATH,"fitness_assays.csv"))

# HTML rendering: sticky header/index, bars on every "proba*" column.
border =  [{'selector': 'th', 'props': 'border-left: 1px solid black'},
           {'selector': 'td', 'props': 'border-left: 1px solid black'}]
# NOTE(review): ('ID','name') is not a column created in this script, so this
# border rule presumably matches nothing - confirm before removing it.
html = strains.style.set_table_styles({
    ('ID','name'): border
}, overwrite=False, axis=0)\
.set_sticky(axis="index")\
.set_sticky(axis="columns")\
.bar(subset=[x for x in strains.columns if "proba" in x], color=matplotlib.colors.to_hex('C0'))\
.to_html()

# The page declares charset utf-8, so it is written as utf-8 explicitly
# rather than with the platform default encoding.
with open(os.path.join(WWW_PATH, "fitness_assays.html"),'w', encoding="utf-8") as f:
    f.write("<html><head><meta charset=\"utf-8\"><title>Fitness Assays Data</title><style> table, th, td {border: 1px solid} table{border-collapse: collapse;} thead{background-color:white;} </style></head><body>")
    f.write(html)
    # Close the <html> element opened above as well as <body>.
    f.write("</body></html>")

# Panels drawn by plot() and plot_wide(), in order:
# [success-count column of `strains`, panel title].
plots = [
    [
        "mat_phase1",
        "Probability to avoid soma failure\n during Ph. I",
    ],
    [
        'SM_phase1',
        "Probability to produce germ cells\n during Ph. I",
    ],
    [
        'WS_phase2',
        "Probability to produce soma cells\n during Ph. II",
    ],
]

def plotproba(col, df, title, ax, number=True, col_color=None):
    """Draw one row per strain: observed success fraction plus Beta intervals.

    For every row of ``df`` the success count ``row[col]`` is compared with
    the sample size ``row["n_phase1"]`` (when ``col`` contains "phase1") or
    ``row["n_phase2"]`` (any other column). The row shows a dot at the
    observed fraction, a translucent bar for the central 50% interval and a
    thick line for the central 95% interval of a Beta(1 + k, 1 + n - k)
    distribution, with the fraction and the "k/n" count written next to it.

    Parameters
    ----------
    col : str
        Name of the success-count column.
    df : pandas.DataFrame
        Must provide ``col``, the matching ``n_phase*`` column and
        ``microcosm`` (used, stripped and right-aligned, as y tick label).
    title : str
        Bold panel title.
    ax : matplotlib Axes
        Axes to draw on.
    number : bool
        Unused. Kept only so existing positional/keyword callers keep working
        (the previous implementation overwrote it with the sample size).
    col_color : str, optional
        Name of a column whose truthy rows are drawn in 'C1' instead of 'C0'.
    """
    phase = 1 if 'phase1' in col else 2
    trials_col = f"n_phase{phase}"

    labels = []
    yticks = []
    for u, (_, row) in enumerate(df.iterrows(), start=1):
        c = 'C1' if (col_color and row[col_color]) else 'C0'

        successes = row[col]
        trials = row[trials_col]
        posterior = scipy.stats.beta(1 + successes, 1 + trials - successes)
        proba = successes / trials
        # Each interval is computed once and reused for both of its ends.
        low50, high50 = posterior.interval(0.50)
        low95, high95 = posterior.interval(0.95)

        labels.append(f"{row.microcosm.strip():>30}")
        yticks.append(u)

        ax.scatter(proba, u, color=c)
        ax.barh(u, high50 - low50, left=low50, alpha=0.5, color=c)
        ax.hlines(u, xmin=low95, xmax=high95, color=c, lw=4)
        ax.text(proba, u + 0.4, f"{proba:03.2f}", ha='center', va='top')

        # Best effort: missing counts (NaN/NA) cannot be written as integers,
        # so the "k/n" annotation is skipped for such rows. Only the
        # conversion is guarded; drawing errors are no longer swallowed.
        try:
            count_label = f"{int(successes)}/{int(trials)}"
        except (TypeError, ValueError, OverflowError):
            count_label = None
        if count_label is not None:
            ax.text(proba, u - 0.4, count_label, ha='center')

    # Dotted guide line through every row.
    for u in yticks:
        ax.axhline(u, color='k', ls=":", alpha=0.5)

    ax.set(yticks=yticks, yticklabels=labels,
           xlim=(0, 1))
    ax.set_title(title, font={'weight': 'bold'})
    
    
def plotproba2(col, df, title, ax, col_color=None):
    """Draw one dot per strain at an already-computed probability.

    Each row of ``df`` gets a dot at ``row[col]`` with the value written next
    to it, a dotted guide line, and the x axis fixed to [0, 1].

    The y tick labels are taken from the ``microcosm`` column (stripped and
    right-aligned to 30 characters, the same format plotproba uses). The
    previous implementation passed an always-empty label list, so the panel
    only showed labels when another panel sharing its y axis set them
    afterwards. If ``df`` has no ``microcosm`` column the labels stay empty,
    as before.

    Parameters
    ----------
    col : str
        Name of the probability column.
    df : pandas.DataFrame
        Rows to draw, bottom to top.
    title : str
        Bold panel title.
    ax : matplotlib Axes
        Axes to draw on.
    col_color : str, optional
        Name of a column whose truthy rows are drawn in 'C1' instead of 'C0'.
    """
    has_names = "microcosm" in df.columns
    labels = []
    yticks = []
    for u, (_, row) in enumerate(df.iterrows(), start=1):
        c = 'C1' if (col_color and row[col_color]) else 'C0'
        proba = row[col]

        if has_names:
            name = row["microcosm"]
            # A missing name yields a blank label instead of an error.
            text = name.strip() if isinstance(name, str) else ""
            labels.append(f"{text:>30}")
        yticks.append(u)

        ax.scatter(proba, u, color=c)
        ax.text(proba, u + 0.4, f"{proba:03.2f}", ha='center', va='top')

    # Dotted guide line through every row.
    for u in yticks:
        ax.axhline(u, color='k', ls=":", alpha=0.5)

    ax.set(yticks=yticks, yticklabels=labels,
           xlim=(0, 1))
    ax.set_title(title, font={'weight': 'bold'})
    
def plot(df, size=(20,18)):
    """Build the 2x2 summary figure for the strains in ``df``.

    The bottom-right panel shows the overall life-cycle probability; the
    remaining panels are filled, in reading order, with the entries of
    ``plots``. All panels share their y axis. Returns ``(fig, ax)`` where
    ``ax`` is the 2x2 array of Axes.
    """
    fig, ax = plt.subplots(
        2, 2,
        figsize=size,
        layout='constrained',
        sharey=True,
    )
    # The life-cycle panel is drawn first, before the per-step panels.
    plotproba2("proba_lifecycle", df, "Probability to complete the life cycle", ax[1,1])
    panels = list(ax.flat)
    for position, (col, title) in enumerate(plots):
        plotproba(col, df, title, panels[position])
    return fig, ax

# Flag the populations from cycle 11. convert_PE labels evolved lines as
# "<cycle>-I-<microcosm>", so the test is anchored on the "11-" prefix; a
# plain substring test would also match an "11" occurring later in the label
# (for instance in a well id).
strains["endpoint"] = [x.startswith("11-") for x in strains.microcosm]

# Keep only rows whose label mentions neither 'wss' nor 'mut'.
st = strains[[('wss' not in x and 'mut' not in x) for x in strains.microcosm]]
for t in utils.TREATMENTS:
    # Split once only, so a genotype such as "LCS-" keeps its trailing dash.
    size, genotype = t.split("-",1)
    # Endpoint populations (best life-cycle probability first) followed by
    # the matching ancestor.
    # NOTE(review): this rebinds `data`, which until here held the raw
    # observations - rename if the raw frame is needed again later.
    data = pd.concat([st.query(f"size=='{size}' and treatment=='{genotype}' and endpoint==True").sort_values('proba_lifecycle', ascending=False),
                      st.query(f"size=='{size}' and microcosm=='{genotype} ancestor'")
                     ])
    fig,ax = plot(data,(10,9) )
    fig.suptitle(t)
    print(size, genotype)
    plt.savefig(os.path.join(EXPORT_PATH, f"proba_{t}.pdf"),bbox_inches='tight')
    plt.savefig(os.path.join(EXPORT_PATH, f"proba_{t}.png"),bbox_inches='tight')
    plt.show()

S LCS+

L LCS+

S LCS-

L LCS-

# Overview figures: every strain of one microcosm size, ordered by
# increasing life-cycle probability.
large_strains = strains[strains['size'] == 'L']
fig, ax = plot(large_strains.sort_values('proba_lifecycle'))
fig.suptitle("Large microcosms")
plt.savefig(os.path.join(EXPORT_PATH, "proba_large.pdf"), bbox_inches='tight')

small_strains = strains[strains['size'] == 'S']
fig, ax = plot(small_strains.sort_values('proba_lifecycle'))
fig.suptitle("Small microcosms")
plt.savefig(os.path.join(EXPORT_PATH, "proba_small.pdf"), bbox_inches='tight')

# Hand-picked strains for the focused figure. The list is reversed because
# rows are drawn at increasing y, so the first name ends up in the top row.
to_disp = list(reversed([
    'Line-17 mutS',
    'PE-2-WT-A5',
    'PE-2-WT-A5 mutL',
    'PE-3-WT-E2',
    'PE-4-WT-E3 ',
    'PE-5-WT-E3',
]))

# Large microcosms only, looked up by raw strain name in display order.
large_by_strain = strains[strains['size'] == 'L'].set_index('strain')
fig, ax = plot(large_by_strain.loc[to_disp, :], (10, 8))
plt.savefig(os.path.join(EXPORT_PATH, "proba.pdf"), bbox_inches='tight')

def plot_wide(df, size=(14,5), color=None):
    """Build the single-row, four-panel version of the summary figure.

    Panel "a" shows the overall life-cycle probability; panels "b" to "d"
    show the entries of ``plots`` in order. All panels share their y axis.
    ``color`` is forwarded as ``col_color`` to both panel helpers.
    Returns ``(fig, ax)`` where ``ax`` is the array of four Axes.
    """
    fig, ax = plt.subplots(
        1, 4,
        figsize=size,
        layout='constrained',
        sharey=True,
    )
    # The life-cycle panel is drawn first, before the per-step panels.
    plotproba2("proba_lifecycle", df, "a. Probability to complete\n the life cycle", ax[0], col_color=color)
    letters = "bcd"
    for position, (col, title) in enumerate(plots):
        panel_title = letters[position] + ". " + title
        plotproba(col, df, panel_title, ax[position + 1], col_color=color)
    return fig, ax

# Wide-layout version of the per-treatment figures. Rows whose label mentions
# 'wss' or 'mut' are left out.
# NOTE(review): the files written here have the same names as those of the
# 2x2 per-treatment figures, so they replace them - confirm this is intended.
st = strains[[not ('wss' in x or 'mut' in x) for x in strains.microcosm]]
for t in utils.TREATMENTS:
    size, genotype = t.split("-", 1)
    endpoint_rows = st.query(
        f"size=='{size}' and treatment=='{genotype}' and endpoint==True"
    ).sort_values('proba_lifecycle', ascending=False)
    ancestor_rows = st.query(f"size=='{size}' and microcosm=='{genotype} ancestor'")
    # Endpoint populations first, the matching ancestor last.
    data = pd.concat([endpoint_rows, ancestor_rows])
    fig, ax = plot_wide(data)
    fig.suptitle(t, fontweight='bold', fontsize=20)
    print(size, genotype)
    plt.savefig(os.path.join(EXPORT_PATH, f"proba_{t}.pdf"), bbox_inches='tight')
    plt.savefig(os.path.join(EXPORT_PATH, f"proba_{t}.png"), bbox_inches='tight')
    plt.show()

S LCS+

L LCS+

S LCS-

L LCS-

# Hand-picked strains for the focused wide figure, coloured by the `mutL`
# flag. The list is reversed because rows are drawn at increasing y, so the
# first name ends up in the top row.
# NOTE(review): this writes "proba.pdf", the same file name as the focused
# 2x2 figure - confirm the overwrite is intended.
to_disp = list(reversed([
    'Line-17 mutS',
    'PE-2-WT-A5',
    'PE-2-WT-A5 mutL',
    'PE-3-WT-E2',
    'PE-4-WT-E3 ',
    'PE-5-WT-E3',
]))

large_by_strain = strains[strains['size'] == 'L'].set_index('strain')
fig, ax = plot_wide(large_by_strain.loc[to_disp, :], color='mutL')
plt.savefig(os.path.join(EXPORT_PATH, "proba.pdf"), bbox_inches='tight')

	Date	Strain	Size	Rack	Tube	Phase	Mat	Count WS	Count SM	Extinct	Replaced by	microcosm	treatment
0	2016-11-15	Line-17 mutS	small	I	1	1	1.0	100.0	0.0	1.0	I6	LCS- ancestor	ancestor
1	2016-11-15	Line-17 mutS	small	I	2	1	1.0	100.0	0.0	1.0	I6	LCS- ancestor	ancestor
2	2016-11-15	Line-17 mutS	small	I	3	1	1.0	108.0	0.0	1.0	I5	LCS- ancestor	ancestor
3	2016-11-15	Line-17 mutS	small	I	4	1	1.0	116.0	0.0	1.0	I5	LCS- ancestor	ancestor
4	2016-11-15	Line-17 mutS	small	I	5	1	1.0	40.0	11.0	0.0	0	LCS- ancestor	ancestor

	strain	size	mutL	treatment	microcosm	n_phase1	extinct_phase1	mat_phase1	WS_phase1	SM_phase1	n_phase2	extinct_phase2	mat_phase2	WS_phase2	SM_phase2	proba_mat_p1	proba_SM_p1	proba_WS_p2	proba_lifecycle
0	Line-17	L	False	ancestor	LCS+ ancestor	72	50	24	69	70	72	0	31	63	63	0.333333	0.972222	0.875000	0.283565
1	Line-17	S	False	ancestor	LCS+ ancestor	72	4	69	72	71	72	0	58	71	71	0.958333	0.986111	0.986111	0.931898
2	Line-17 mutS	L	False	ancestor	LCS- ancestor	184	156	51	179	158	138	25	32	112	136	0.277174	0.858696	0.811594	0.193166
3	Line-17 mutS	S	False	ancestor	LCS- ancestor	88	51	60	88	65	88	13	40	69	72	0.681818	0.738636	0.784091	0.394880
4	Line-17 mutS mutL*	S	True	ancestor	LCS- ancestor $mutL$	48	11	35	45	46	37	0	18	35	35	0.729167	0.958333	0.945946	0.661013