In [1]:
import glob
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib
import colgen.display
import matplotlib.ticker as mtick
import matplotlib.patches as mpatches
import re
SMALL_SIZE = 12
MEDIUM_SIZE = 14
BIGGER_SIZE = 25

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=BIGGER_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
In [2]:
# Prepare export
from utils import DATASETS, format_tree_labels, COLORS, rack_layout, TREATMENTS
EXPORT_PATH = '05_mutation_propagation'
if not os.path.exists(EXPORT_PATH):
    os.mkdir(EXPORT_PATH)
In [3]:
# Load DATA
ssq = pd.read_csv("04_mutations/sequenced_cultures.csv")
mutation_description = pd.read_csv("04_mutations/mutation_description.csv", index_col=0)
trees = pd.concat([pd.read_csv(k) for k in DATASETS])

sequenced_nodes = frozenset(ssq.parent.unique())

# Read the propagated data from Colgen
all_data = []
for f in glob.glob("04_mutations/propagated/*mapping*.csv"):
    m = pd.read_csv(f).set_index('name') 
    m = m[[x for x in m.columns if ("MUT" in x)]]
    m = m.melt(var_name="mut_id", ignore_index=False, value_name="present")
    all_data.append(m[m.present==1])
mutation_longtable = pd.concat(all_data)

# Add the description of mutations
mutations = pd.merge(mutation_longtable.reset_index(),
                     mutation_description.drop(columns="name").reset_index(),
                     how='left')

### FILTERING ### 

sequenced = pd.read_csv("04_mutations/sequencing_long_table.csv", index_col=0)

mutations.rename(columns={'name':'node'}, inplace=True)

# Export all_propagated_mutation_data
mutations.to_csv(os.path.join(EXPORT_PATH, "all_propagated_mutation_data.csv"))

import sqlite3
with sqlite3.connect("lce_data.sqlite") as database:
    mutations.loc[:, ["mut_id","node"]].rename(columns={'node':'name'}).to_sql("propagated_mutations", database, index=False, if_exists="replace",)
In [4]:
mutations['treatment'] = [x.split('_')[0] for x in mutations.node]
mutations['background'] = [x.split('-',1)[1] for x in mutations.treatment]

mut_mutation_description = mutation_description[['mut' in x.gene for k,x in mutation_description.iterrows()]]
never_filter_out = frozenset(mut_mutation_description.name)


filtered = []
for bg, dt in mutations.groupby('background'):
    to_filter_out = frozenset(sequenced[sequenced.genotype==f"{bg} ancestor"].mut_id)
    to_filter_out = to_filter_out-never_filter_out
    filtered.append(dt[[x not in to_filter_out for x in dt.mut_id]])
filtered = pd.concat(filtered)

# Now we proceed with the filtered dataset
mutations = filtered
In [5]:
# Keep only the mut ,mutations 
mut_mutations = mutations[['mut' in x.gene for k,x in mutations.iterrows()]].copy()
mut_mutations['gene_name'] = [x.split()[0] for x in mut_mutations.gene]
mut_mutations['mutation_name'] = [x.gene.split()[0]+' '+x.annotation.split()[0] for k,x in mut_mutations.iterrows()]
print(f"{mut_mutations.shape[0]} mutations identified in genes {mut_mutations.gene_name.unique()}")
2869 mutations identified in genes ['mutY' 'mutL' 'mutS' 'mutM']
In [6]:
def simplify(x):
    if x[0] == "(" and x[2] == ")":
        return f"Track of {x[1]}"
    if x[0] == "(":
        return "Motif #"
    if x[0] == "Δ":
        return "Deletion"
    if 'bp' in x:
        return "bp → XX"
    return x

mutations['mutation_simple'] = [simplify(x) for x in mutations.mutation]
In [7]:
def label_motif(u, number=False):
    if number:
        par = lambda x: f" ({x} mutations)" if x>1 else ""
    else:
        par = lambda x: ''
    muts,mutl,muty = [int(float(x)) for x in u.split('-')]
    s = []
    if mutl:
        s.append("$mutL$"+par(mutl))
    if muts:
        s.append("$mutS$"+par(muts))
    if muty:
        s.append("$mutY$"+par(muty))
    if not s:
        s =['no $mut$ mutation']
    return " and ".join(s)
In [8]:
# Compute the mut-patterns
mut_mutations['present'] = 1 
mut = mut_mutations.pivot_table(index="node",columns="gene_name", values="present",aggfunc='sum').fillna(0)

# Add the number of mutations and the nodes without mut-mutations
count_mutations = mutations.groupby('node').count().rename(columns={'mut_id':'number_of_mutations'}).loc[:,"number_of_mutations"].reset_index()
mut = mut.astype(int)
mut = pd.merge(mut, count_mutations, how='right',left_index=True, right_on="node").fillna(0)

mut['mut_combination'] = [f"{x['mutS']}-{x['mutL']}-{x['mutY']}" for _, x in mut.iterrows()]
mut['mut_combination_simple'] = [f"{int(x['mutS']>0)}-{int(x['mutL']>0)}-{int(x['mutY']>0)}" for _, x in mut.iterrows()]
mut['mut_label'] = [label_motif(x.mut_combination) for _, x in mut.iterrows()]
mut['mut_label_long'] = [label_motif(x.mut_combination, True) for _, x in mut.iterrows()]

# A string with the exact mutations 
mutation_detailed = {}
mm = mut_mutations.pivot_table(index="node",columns="mutation_name", values="present",aggfunc='sum').fillna(0)
mm = mm.loc[:,sorted(list(mm.columns))]
for k,v in mm.iterrows():
    mutation_detailed[k] = ", ".join([re.sub("(mut[SLY])",r"$\1$", name) for name,present in v.items() if present])
mut['mut_label_detailed'] = [mutation_detailed[k] if k in mutation_detailed else 'no $mut$ mutation' for k in mut.node]

print(f"{mut.mut_label.nunique()} simple patterns: {mut.mut_label.unique()}")
print(f"{mut.mut_label_long.nunique()} long patterns")
print(f"{mut.mut_label_detailed.nunique()} detailed patterns")

# Add metadata 
mut = pd.merge(trees.loc[:,['name','time','experiment']], mut, left_on='name',right_on='node')
mut.set_index('node')
mut
5 simple patterns: ['$mutS$' '$mutS$ and $mutY$' '$mutL$ and $mutS$' 'no $mut$ mutation'
 '$mutL$']
7 long patterns
18 detailed patterns
Out[8]:
name time experiment mutL mutM mutS mutY node number_of_mutations mut_combination mut_combination_simple mut_label mut_label_long mut_label_detailed
0 S-LCS+_1-I-a1 1 S-LCS+ 0.0 0.0 1.0 0.0 S-LCS+_1-I-a1 85 1.0-0.0-0.0 1-0-0 $mutS$ $mutS$ $mutS$ T497P
1 S-LCS+_1-I-a2 1 S-LCS+ 0.0 0.0 1.0 0.0 S-LCS+_1-I-a2 1 1.0-0.0-0.0 1-0-0 $mutS$ $mutS$ $mutS$ T497P
2 S-LCS+_1-I-a3 1 S-LCS+ 0.0 0.0 1.0 0.0 S-LCS+_1-I-a3 41 1.0-0.0-0.0 1-0-0 $mutS$ $mutS$ $mutS$ T497P
3 S-LCS+_1-I-a4 1 S-LCS+ 0.0 0.0 1.0 0.0 S-LCS+_1-I-a4 1 1.0-0.0-0.0 1-0-0 $mutS$ $mutS$ $mutS$ T497P
4 S-LCS+_1-I-a5 1 S-LCS+ 0.0 0.0 1.0 0.0 S-LCS+_1-I-a5 1 1.0-0.0-0.0 1-0-0 $mutS$ $mutS$ $mutS$ T497P
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3337 L-LCS-_10-II-F4 20 L-LCS- 1.0 0.0 1.0 0.0 L-LCS-_10-II-F4 293 1.0-1.0-0.0 1-1-0 $mutL$ and $mutS$ $mutL$ and $mutS$ $mutL$ L103R, $mutS$ R142R
3338 L-LCS-_10-II-F5 20 L-LCS- 1.0 0.0 0.0 0.0 L-LCS-_10-II-F5 301 0.0-1.0-0.0 0-1-0 $mutL$ $mutL$ $mutL$ L103R
3339 L-LCS-_10-II-F6 20 L-LCS- 1.0 0.0 0.0 0.0 L-LCS-_10-II-F6 259 0.0-1.0-0.0 0-1-0 $mutL$ $mutL$ $mutL$ L103R
3340 L-LCS-_10-II-F7 20 L-LCS- 1.0 0.0 0.0 0.0 L-LCS-_10-II-F7 267 0.0-1.0-0.0 0-1-0 $mutL$ $mutL$ $mutL$ L103R
3341 L-LCS-_10-II-F8 20 L-LCS- 1.0 0.0 0.0 0.0 L-LCS-_10-II-F8 303 0.0-1.0-0.0 0-1-0 $mutL$ $mutL$ $mutL$ L103R

3342 rows × 14 columns

In [9]:
marker = {'L-LCS+':'P', 'S-LCS+':"x", 'L-LCS-':"d", 'S-LCS-':"."}
cm = plt.cm.tab20c
colors_n = {'0-0-0': 16,
 '1-0-0': 8,
 '0-1-0': 4,
 '1-0-1': 0,
 '1-1-0': 12}
colors = {k:cm(v) for k,v in colors_n.items()}
#plt.subplots(1,1,figsize=(10,10))
#plt.plot()
#plt.legend(handles=[mpatches.Patch(color=c, label=l) for l,c in colors.items()])
In [10]:
color_detailed = {}
for ex, df in mut.groupby('experiment'):
    offset = {}
    for motif,name in sorted(df.groupby(['mut_combination_simple','mut_label_detailed']).count().index,
                            key=lambda x: (len(x[1]),x)):
        o = (offset[motif] if (motif in offset) else 0)
        cn = colors_n[motif]+o
        color_detailed[name] = cm(cn)
        
        if motif not in offset:
            offset[motif] = 1
        else:
            offset[motif] += 1
In [11]:
plt.rc('legend', fontsize=16)  
fig, ax = plt.subplots(1,1, figsize=(20,6), layout='constrained', sharex=True)
for (motif,treat), mm in mut.groupby(['mut_combination_simple','experiment']):
    ax.scatter(mm.time+0.6*(np.random.random(size=len(mm.time))-0.5),
               mm.number_of_mutations, 
               marker=marker[treat],
               color=colors[motif])
motifs = ["0-0-0","1-0-0", "1-0-1", "0-1-0", "1-1-0"]
labels = mut.loc[:,["mut_combination_simple","mut_label"]].drop_duplicates().set_index('mut_combination_simple').mut_label.to_dict()
patches = ([mpatches.Patch(color=colors[m], label=labels[m]) for m in motifs]+ 
           [ax.scatter([],[], marker=m, label=k, color='k') for k,m in marker.items()])

ax.legend(handles=patches, loc="upper left", ncols=2)

ax.set(
      xlim=(0.5,20.5),
      #ylim=(0,400),
      ylabel="Estimated Number of Mutations",
      xticks=np.arange(0,20)+1)
ax.set(xticklabels=[f"{1+i//2}-{'II' if i%2 else 'I'}" for i in range(20)])

fig.savefig(os.path.join(EXPORT_PATH,"nmut_colored_by_mut.png"), bbox_inches='tight')
fig.savefig(os.path.join(EXPORT_PATH,"nmut_colored_by_mut.pdf"), bbox_inches='tight')
No description has been provided for this image
In [12]:
plt.rc('legend', fontsize=16)  
fig, ax = plt.subplots(1,1, figsize=(20,6), layout='constrained', sharex=True)
for treat, mm in mut.groupby('experiment'):
    ax.scatter(mm.time+0.6*(np.random.random(size=len(mm.time))-0.5),
               mm.number_of_mutations, 
               marker='.',
               color=COLORS[treat])
    
    std = mm.groupby('time').number_of_mutations.quantile()
    bot = mm.groupby('time').number_of_mutations.quantile(0.25)
    top = mm.groupby('time').number_of_mutations.quantile(0.75)
    x = mm.groupby('time').number_of_mutations.mean()
    c = mm.groupby('time').number_of_mutations.count()
        
    ax.plot(x[c>1].index, x[c>1].values, color=COLORS[treat], label=treat)
    ax.fill_between(x[c>1].index, bot[c>1].values, top[c>1].values, alpha=0.2, color=COLORS[treat])

ax.legend(loc="upper left")

ax.set(
      xlim=(0.5,20.5),
      #ylim=(0,400),
      ylabel="Estimated Number of Mutations",
      xticks=np.arange(0,20)+1)
ax.set(xticklabels=[f"{1+i//2}-{'II' if i%2 else 'I'}" for i in range(20)])

fig.savefig(os.path.join(EXPORT_PATH,"nmut_colored_by_exp.png"), bbox_inches='tight')
fig.savefig(os.path.join(EXPORT_PATH,"nmut_colored_by_exp.pdf"), bbox_inches='tight')
No description has been provided for this image
In [13]:
def plot_tree(d, color_nodes,ax, marked=None, legend=None):
    tree, df = colgen.display.load_df(d)
    ypos = rack_layout(df)
    coal = colgen.display.coalescent(list(df.query('time==20 & extinct==0').name), tree['branches'])
    _, scales = colgen.display.draw_tree(tree['branches'],
                                    tree['xinfo'],
                                    ypos, 
                                    oinfo={d['name']:1 if d['name'] in coal else 1 for _,d in df.iterrows()},
                                    color=color_nodes,
                                    child_color_branch=True,
                                    ax=ax)
    if marked is None:
        marked = sequenced_nodes.intersection(frozenset(tree['xinfo'].keys()))
    stars = ax.scatter([scales[0](x) for x in marked],
                [scales[1](x) for x in marked],
                fc='w',ec='k',s=300,marker='s',label='Sequenced',zorder=-1)
    if legend is None:
        ax.legend(handles=patches+[stars], loc="lower left")
    else:
        ax.legend(handles=legend+[stars], loc="lower left")
    format_tree_labels(tree['xinfo'].keys(), ax, scales)
    return ypos, scales
In [14]:
mutLname = mutation_description.query("position==587260").name.values[0]
lg = [mpatches.Patch(color=colors[m], label=labels[m]) for m in ['0-0-0','0-1-0']]
using_all_sequencing = pd.read_csv("04_mutations/propagated/L-LCS-_mutation_mapping.csv")
color_nodes = {k:color_detailed['$mutL$ L103R'] if v else color_detailed['no $mut$ mutation']
               for k,v in 
               using_all_sequencing.set_index('name').loc[:,mutLname].items()}
fig,ax = plt.subplots(1,1, figsize=(12*1.5,9*1.5))
plot_tree(DATASETS[TREATMENTS.index("L-LCS-")], color_nodes, ax, legend=lg)
plt.savefig(os.path.join(EXPORT_PATH, f"mutL.pdf"), bbox_inches='tight')
plt.show()
No description has been provided for this image
In [15]:
endpoints = pd.read_csv("04_mutations/propagated/endpoints/L-LCS-_mutation_mapping.csv")
color_nodes = {k:color_detailed['$mutL$ L103R'] if v else color_detailed['no $mut$ mutation']
               for k,v in 
               endpoints.set_index('name').loc[:,mutLname].items()}
marked = sequenced_nodes.intersection(frozenset([x for x in endpoints['name'] if '10-II' in x]))
fig,ax = plt.subplots(1,1, figsize=(12*1.5,9*1.5))
plot_tree(DATASETS[TREATMENTS.index("L-LCS-")], color_nodes, ax, marked, legend=lg)
plt.savefig(os.path.join(EXPORT_PATH, f"mutL_endpoints.pdf"), bbox_inches='tight')
plt.show()
No description has been provided for this image
In [16]:
color_nodes = {k:color_detailed[v] if v in color_detailed else 'k' 
               for k,v in mut.set_index('node').mut_label_detailed.to_dict().items()}
for t,d in zip(TREATMENTS,DATASETS):
    fig,ax = plt.subplots(1,1, figsize=(12*1.5,9*1.5))
    
    to_label = mut.loc[mut.experiment==t,['mut_label','mut_label_detailed']].drop_duplicates()
    patches = [mpatches.Patch(color=color_detailed[r.mut_label_detailed],
                              label=r.mut_label_detailed) for _,r in to_label.iterrows()] 
    
    ax.legend(handles=patches, loc="lower left")
    ax.set_title(t, font={'size'   : 15, 'weight':'bold'})
    plot_tree(d, color_nodes, ax)
    plt.savefig(os.path.join(EXPORT_PATH, f"mut-{t}-tree.png"), bbox_inches='tight')
    plt.savefig(os.path.join(EXPORT_PATH, f"mut-{t}-tree.pdf"), bbox_inches='tight')
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [17]:
for t,d in zip(TREATMENTS,DATASETS):
    ax = plt.figure(layout="constrained",
       figsize=(18,10)).subplot_mosaic("""AAAAAAB
                                         AAAAAAB""")    
    to_label = mut.loc[mut.experiment==t,['mut_label','mut_label_detailed']].drop_duplicates()
    patches = [mpatches.Patch(color=color_detailed[r.mut_label_detailed],
                              label=r.mut_label_detailed) for _,r in to_label.iterrows()] 
    
    #ax.legend(handles=patches, loc="lower left")
    
    ax['A'].legend(handles=patches, loc="lower left")
    ax['A'].set_title(t, font={'size'   : 15, 'weight':'bold'})
    ypos, scales = plot_tree(d, color_nodes, ax['A'])
    
    ax['B'].sharey(ax['A'])
    mi = np.min(list(ypos.values()))
    mx = np.max(list(ypos.values()))
    
    endpoints = [x for _,x in mut[mut.time==20].iterrows() if x.node in ypos]
    
    ax['B'].barh([scales[1](x.node) for x in endpoints],
                 [x.number_of_mutations for x in endpoints],
                  15, 
                 color=[color_nodes[x.node] if x.node in color_nodes else 'k' for x in endpoints])
    ax['B'].set_xlabel("Number of mutations")
    for x in endpoints:
        ax['B'].text(0,#x.number_of_mutations,
                     scales[1](x.node), 
                     x.mut_label_detailed,
                     color='k',
                     va='center')
        
    plt.savefig(os.path.join(EXPORT_PATH, f"nmut-{t}-tree.png"), bbox_inches='tight')
    plt.savefig(os.path.join(EXPORT_PATH, f"nmut-{t}-tree.pdf"), bbox_inches='tight')
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [18]:
mut_type_data = []

groups =  [(frozenset(mut[['no $mut$ mutation'==x for x in mut.mut_label_detailed]].node), "No $mut$ mutation"),
           (frozenset(mut[['$mutS$ T497P'==x for x in mut.mut_label_detailed]].node), "$mutS$ T497P only"),
           (frozenset(mut[['$mutS$ T497P, $mutY$ G153D'==x for x in mut.mut_label_detailed]].node), "$mutS$ T497P and $mutY$ G153D")]

for nodes,label in groups:
    print(len(nodes.intersection(sequenced_nodes)), label)
    label += f" ({len(nodes.intersection(sequenced_nodes))} genotypes)"
    count = mutations[[x in nodes.intersection(sequenced_nodes) for x in mutations.node]].groupby('mutation_simple').count().mut_id
    count /= count.sum()
    count.name = label
    mut_type_data.append(count)
dd = pd.DataFrame(mut_type_data).transpose()
fig,ax = plt.subplots(1,1,figsize=(15,7.5))
dd.plot(kind='bar', ax=ax, color=[colors['0-0-0'],colors['1-0-0'],colors['1-0-1']])
ax.set(xlabel='Mutation',ylabel="Proportion")
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
plt.savefig(os.path.join(EXPORT_PATH, "kind_of_mutations.pdf"), bbox_inches='tight')
plt.savefig(os.path.join(EXPORT_PATH, "kind_of_mutations.svg"), bbox_inches='tight')
40 No $mut$ mutation
84 $mutS$ T497P only
8 $mutS$ T497P and $mutY$ G153D
No description has been provided for this image
In [19]:
def plot_number(ax, ex, number):
    tree, df = colgen.display.load_df(f"01_genealogies/{ex}.csv")
    coal = colgen.display.coalescent(list(df.query('time==20 & extinct==0').index),
                             tree['branches'])
    ax, scales = colgen.display.draw_tree(tree['branches'],
                                          tree['xinfo'],
                                          rack_layout(df.reset_index()), 
                                          oinfo={name:1 if name in coal else 0.5 for name,d in df.iterrows()},
                                          color=number,
                                          colormap=True,
                                          child_color_branch=True,
                                          ax = ax)
    marked = sequenced_nodes.intersection(frozenset(tree['nodes']))
    stars = ax.scatter([scales[0](x) for x in marked],
                [scales[1](x) for x in marked],
                fc='w',ec='k',s=300,marker='s',label='Sequenced',zorder=-1)
    ax.legend(handles=[stars], loc="lower left")
    format_tree_labels(tree['xinfo'].keys(), ax, scales)
    ax.set_title(ex+" - Predicted number of mutations", font={'weight':'bold'})
In [20]:
ax = plt.figure(layout="constrained",
   figsize=(2*12*1.5,2*9*1.5)).subplot_mosaic("""AB
                             CD""")
fig = plt.gcf()
for ex,a in [("L-LCS-",'D'),("S-LCS-",'B'),("L-LCS+",'C'),("S-LCS+",'A')]:
    plot_number(ax[a], ex, mut.set_index('node').number_of_mutations.to_dict())
    
fig.savefig(os.path.join(EXPORT_PATH,'mutations_tree.png'), bbox_inches='tight')
fig.savefig(os.path.join(EXPORT_PATH,'mutations_tree.pdf'), bbox_inches='tight')
No description has been provided for this image