In [1]:
import glob
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib
import colgen.display
import matplotlib.ticker as mtick
import matplotlib.patches as mpatches
import re
SMALL_SIZE = 12
MEDIUM_SIZE = 14
BIGGER_SIZE = 25
plt.rc('font', size=SMALL_SIZE) # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE) # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc('legend', fontsize=BIGGER_SIZE) # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE) # fontsize of the figure title
In [2]:
# Prepare export
from utils import DATASETS, format_tree_labels, COLORS, rack_layout, TREATMENTS
EXPORT_PATH = '05_mutation_propagation'
if not os.path.exists(EXPORT_PATH):
os.mkdir(EXPORT_PATH)
In [3]:
# Load DATA
ssq = pd.read_csv("04_mutations/sequenced_cultures.csv")
mutation_description = pd.read_csv("04_mutations/mutation_description.csv", index_col=0)
trees = pd.concat([pd.read_csv(k) for k in DATASETS])
sequenced_nodes = frozenset(ssq.parent.unique())
# Read the propagated data from Colgen
all_data = []
for f in glob.glob("04_mutations/propagated/*mapping*.csv"):
m = pd.read_csv(f).set_index('name')
m = m[[x for x in m.columns if ("MUT" in x)]]
m = m.melt(var_name="mut_id", ignore_index=False, value_name="present")
all_data.append(m[m.present==1])
mutation_longtable = pd.concat(all_data)
# Add the description of mutations
mutations = pd.merge(mutation_longtable.reset_index(),
mutation_description.drop(columns="name").reset_index(),
how='left')
### FILTERING ###
sequenced = pd.read_csv("04_mutations/sequencing_long_table.csv", index_col=0)
mutations.rename(columns={'name':'node'}, inplace=True)
# Export all_propagated_mutation_data
mutations.to_csv(os.path.join(EXPORT_PATH, "all_propagated_mutation_data.csv"))
import sqlite3
with sqlite3.connect("lce_data.sqlite") as database:
mutations.loc[:, ["mut_id","node"]].rename(columns={'node':'name'}).to_sql("propagated_mutations", database, index=False, if_exists="replace",)
In [4]:
mutations['treatment'] = [x.split('_')[0] for x in mutations.node]
mutations['background'] = [x.split('-',1)[1] for x in mutations.treatment]
mut_mutation_description = mutation_description[['mut' in x.gene for k,x in mutation_description.iterrows()]]
never_filter_out = frozenset(mut_mutation_description.name)
filtered = []
for bg, dt in mutations.groupby('background'):
to_filter_out = frozenset(sequenced[sequenced.genotype==f"{bg} ancestor"].mut_id)
to_filter_out = to_filter_out-never_filter_out
filtered.append(dt[[x not in to_filter_out for x in dt.mut_id]])
filtered = pd.concat(filtered)
# Now we proceed with the filtered dataset
mutations = filtered
In [5]:
# Keep only the mut ,mutations
mut_mutations = mutations[['mut' in x.gene for k,x in mutations.iterrows()]].copy()
mut_mutations['gene_name'] = [x.split()[0] for x in mut_mutations.gene]
mut_mutations['mutation_name'] = [x.gene.split()[0]+' '+x.annotation.split()[0] for k,x in mut_mutations.iterrows()]
print(f"{mut_mutations.shape[0]} mutations identified in genes {mut_mutations.gene_name.unique()}")
2869 mutations identified in genes ['mutY' 'mutL' 'mutS' 'mutM']
In [6]:
def simplify(x):
if x[0] == "(" and x[2] == ")":
return f"Track of {x[1]}"
if x[0] == "(":
return "Motif #"
if x[0] == "Δ":
return "Deletion"
if 'bp' in x:
return "bp → XX"
return x
mutations['mutation_simple'] = [simplify(x) for x in mutations.mutation]
In [7]:
def label_motif(u, number=False):
if number:
par = lambda x: f" ({x} mutations)" if x>1 else ""
else:
par = lambda x: ''
muts,mutl,muty = [int(float(x)) for x in u.split('-')]
s = []
if mutl:
s.append("$mutL$"+par(mutl))
if muts:
s.append("$mutS$"+par(muts))
if muty:
s.append("$mutY$"+par(muty))
if not s:
s =['no $mut$ mutation']
return " and ".join(s)
In [8]:
# Compute the mut-patterns
mut_mutations['present'] = 1
mut = mut_mutations.pivot_table(index="node",columns="gene_name", values="present",aggfunc='sum').fillna(0)
# Add the number of mutations and the nodes without mut-mutations
count_mutations = mutations.groupby('node').count().rename(columns={'mut_id':'number_of_mutations'}).loc[:,"number_of_mutations"].reset_index()
mut = mut.astype(int)
mut = pd.merge(mut, count_mutations, how='right',left_index=True, right_on="node").fillna(0)
mut['mut_combination'] = [f"{x['mutS']}-{x['mutL']}-{x['mutY']}" for _, x in mut.iterrows()]
mut['mut_combination_simple'] = [f"{int(x['mutS']>0)}-{int(x['mutL']>0)}-{int(x['mutY']>0)}" for _, x in mut.iterrows()]
mut['mut_label'] = [label_motif(x.mut_combination) for _, x in mut.iterrows()]
mut['mut_label_long'] = [label_motif(x.mut_combination, True) for _, x in mut.iterrows()]
# A string with the exact mutations
mutation_detailed = {}
mm = mut_mutations.pivot_table(index="node",columns="mutation_name", values="present",aggfunc='sum').fillna(0)
mm = mm.loc[:,sorted(list(mm.columns))]
for k,v in mm.iterrows():
mutation_detailed[k] = ", ".join([re.sub("(mut[SLY])",r"$\1$", name) for name,present in v.items() if present])
mut['mut_label_detailed'] = [mutation_detailed[k] if k in mutation_detailed else 'no $mut$ mutation' for k in mut.node]
print(f"{mut.mut_label.nunique()} simple patterns: {mut.mut_label.unique()}")
print(f"{mut.mut_label_long.nunique()} long patterns")
print(f"{mut.mut_label_detailed.nunique()} detailed patterns")
# Add metadata
mut = pd.merge(trees.loc[:,['name','time','experiment']], mut, left_on='name',right_on='node')
mut.set_index('node')
mut
5 simple patterns: ['$mutS$' '$mutS$ and $mutY$' '$mutL$ and $mutS$' 'no $mut$ mutation' '$mutL$'] 7 long patterns 18 detailed patterns
Out[8]:
name | time | experiment | mutL | mutM | mutS | mutY | node | number_of_mutations | mut_combination | mut_combination_simple | mut_label | mut_label_long | mut_label_detailed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | S-LCS+_1-I-a1 | 1 | S-LCS+ | 0.0 | 0.0 | 1.0 | 0.0 | S-LCS+_1-I-a1 | 85 | 1.0-0.0-0.0 | 1-0-0 | $mutS$ | $mutS$ | $mutS$ T497P |
1 | S-LCS+_1-I-a2 | 1 | S-LCS+ | 0.0 | 0.0 | 1.0 | 0.0 | S-LCS+_1-I-a2 | 1 | 1.0-0.0-0.0 | 1-0-0 | $mutS$ | $mutS$ | $mutS$ T497P |
2 | S-LCS+_1-I-a3 | 1 | S-LCS+ | 0.0 | 0.0 | 1.0 | 0.0 | S-LCS+_1-I-a3 | 41 | 1.0-0.0-0.0 | 1-0-0 | $mutS$ | $mutS$ | $mutS$ T497P |
3 | S-LCS+_1-I-a4 | 1 | S-LCS+ | 0.0 | 0.0 | 1.0 | 0.0 | S-LCS+_1-I-a4 | 1 | 1.0-0.0-0.0 | 1-0-0 | $mutS$ | $mutS$ | $mutS$ T497P |
4 | S-LCS+_1-I-a5 | 1 | S-LCS+ | 0.0 | 0.0 | 1.0 | 0.0 | S-LCS+_1-I-a5 | 1 | 1.0-0.0-0.0 | 1-0-0 | $mutS$ | $mutS$ | $mutS$ T497P |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3337 | L-LCS-_10-II-F4 | 20 | L-LCS- | 1.0 | 0.0 | 1.0 | 0.0 | L-LCS-_10-II-F4 | 293 | 1.0-1.0-0.0 | 1-1-0 | $mutL$ and $mutS$ | $mutL$ and $mutS$ | $mutL$ L103R, $mutS$ R142R |
3338 | L-LCS-_10-II-F5 | 20 | L-LCS- | 1.0 | 0.0 | 0.0 | 0.0 | L-LCS-_10-II-F5 | 301 | 0.0-1.0-0.0 | 0-1-0 | $mutL$ | $mutL$ | $mutL$ L103R |
3339 | L-LCS-_10-II-F6 | 20 | L-LCS- | 1.0 | 0.0 | 0.0 | 0.0 | L-LCS-_10-II-F6 | 259 | 0.0-1.0-0.0 | 0-1-0 | $mutL$ | $mutL$ | $mutL$ L103R |
3340 | L-LCS-_10-II-F7 | 20 | L-LCS- | 1.0 | 0.0 | 0.0 | 0.0 | L-LCS-_10-II-F7 | 267 | 0.0-1.0-0.0 | 0-1-0 | $mutL$ | $mutL$ | $mutL$ L103R |
3341 | L-LCS-_10-II-F8 | 20 | L-LCS- | 1.0 | 0.0 | 0.0 | 0.0 | L-LCS-_10-II-F8 | 303 | 0.0-1.0-0.0 | 0-1-0 | $mutL$ | $mutL$ | $mutL$ L103R |
3342 rows × 14 columns
In [9]:
marker = {'L-LCS+':'P', 'S-LCS+':"x", 'L-LCS-':"d", 'S-LCS-':"."}
cm = plt.cm.tab20c
colors_n = {'0-0-0': 16,
'1-0-0': 8,
'0-1-0': 4,
'1-0-1': 0,
'1-1-0': 12}
colors = {k:cm(v) for k,v in colors_n.items()}
#plt.subplots(1,1,figsize=(10,10))
#plt.plot()
#plt.legend(handles=[mpatches.Patch(color=c, label=l) for l,c in colors.items()])
In [10]:
color_detailed = {}
for ex, df in mut.groupby('experiment'):
offset = {}
for motif,name in sorted(df.groupby(['mut_combination_simple','mut_label_detailed']).count().index,
key=lambda x: (len(x[1]),x)):
o = (offset[motif] if (motif in offset) else 0)
cn = colors_n[motif]+o
color_detailed[name] = cm(cn)
if motif not in offset:
offset[motif] = 1
else:
offset[motif] += 1
In [11]:
plt.rc('legend', fontsize=16)
fig, ax = plt.subplots(1,1, figsize=(20,6), layout='constrained', sharex=True)
for (motif,treat), mm in mut.groupby(['mut_combination_simple','experiment']):
ax.scatter(mm.time+0.6*(np.random.random(size=len(mm.time))-0.5),
mm.number_of_mutations,
marker=marker[treat],
color=colors[motif])
motifs = ["0-0-0","1-0-0", "1-0-1", "0-1-0", "1-1-0"]
labels = mut.loc[:,["mut_combination_simple","mut_label"]].drop_duplicates().set_index('mut_combination_simple').mut_label.to_dict()
patches = ([mpatches.Patch(color=colors[m], label=labels[m]) for m in motifs]+
[ax.scatter([],[], marker=m, label=k, color='k') for k,m in marker.items()])
ax.legend(handles=patches, loc="upper left", ncols=2)
ax.set(
xlim=(0.5,20.5),
#ylim=(0,400),
ylabel="Estimated Number of Mutations",
xticks=np.arange(0,20)+1)
ax.set(xticklabels=[f"{1+i//2}-{'II' if i%2 else 'I'}" for i in range(20)])
fig.savefig(os.path.join(EXPORT_PATH,"nmut_colored_by_mut.png"), bbox_inches='tight')
fig.savefig(os.path.join(EXPORT_PATH,"nmut_colored_by_mut.pdf"), bbox_inches='tight')
In [12]:
plt.rc('legend', fontsize=16)
fig, ax = plt.subplots(1,1, figsize=(20,6), layout='constrained', sharex=True)
for treat, mm in mut.groupby('experiment'):
ax.scatter(mm.time+0.6*(np.random.random(size=len(mm.time))-0.5),
mm.number_of_mutations,
marker='.',
color=COLORS[treat])
std = mm.groupby('time').number_of_mutations.quantile()
bot = mm.groupby('time').number_of_mutations.quantile(0.25)
top = mm.groupby('time').number_of_mutations.quantile(0.75)
x = mm.groupby('time').number_of_mutations.mean()
c = mm.groupby('time').number_of_mutations.count()
ax.plot(x[c>1].index, x[c>1].values, color=COLORS[treat], label=treat)
ax.fill_between(x[c>1].index, bot[c>1].values, top[c>1].values, alpha=0.2, color=COLORS[treat])
ax.legend(loc="upper left")
ax.set(
xlim=(0.5,20.5),
#ylim=(0,400),
ylabel="Estimated Number of Mutations",
xticks=np.arange(0,20)+1)
ax.set(xticklabels=[f"{1+i//2}-{'II' if i%2 else 'I'}" for i in range(20)])
fig.savefig(os.path.join(EXPORT_PATH,"nmut_colored_by_exp.png"), bbox_inches='tight')
fig.savefig(os.path.join(EXPORT_PATH,"nmut_colored_by_exp.pdf"), bbox_inches='tight')
In [13]:
def plot_tree(d, color_nodes,ax, marked=None, legend=None):
tree, df = colgen.display.load_df(d)
ypos = rack_layout(df)
coal = colgen.display.coalescent(list(df.query('time==20 & extinct==0').name), tree['branches'])
_, scales = colgen.display.draw_tree(tree['branches'],
tree['xinfo'],
ypos,
oinfo={d['name']:1 if d['name'] in coal else 1 for _,d in df.iterrows()},
color=color_nodes,
child_color_branch=True,
ax=ax)
if marked is None:
marked = sequenced_nodes.intersection(frozenset(tree['xinfo'].keys()))
stars = ax.scatter([scales[0](x) for x in marked],
[scales[1](x) for x in marked],
fc='w',ec='k',s=300,marker='s',label='Sequenced',zorder=-1)
if legend is None:
ax.legend(handles=patches+[stars], loc="lower left")
else:
ax.legend(handles=legend+[stars], loc="lower left")
format_tree_labels(tree['xinfo'].keys(), ax, scales)
return ypos, scales
In [14]:
mutLname = mutation_description.query("position==587260").name.values[0]
lg = [mpatches.Patch(color=colors[m], label=labels[m]) for m in ['0-0-0','0-1-0']]
using_all_sequencing = pd.read_csv("04_mutations/propagated/L-LCS-_mutation_mapping.csv")
color_nodes = {k:color_detailed['$mutL$ L103R'] if v else color_detailed['no $mut$ mutation']
for k,v in
using_all_sequencing.set_index('name').loc[:,mutLname].items()}
fig,ax = plt.subplots(1,1, figsize=(12*1.5,9*1.5))
plot_tree(DATASETS[TREATMENTS.index("L-LCS-")], color_nodes, ax, legend=lg)
plt.savefig(os.path.join(EXPORT_PATH, f"mutL.pdf"), bbox_inches='tight')
plt.show()
In [15]:
endpoints = pd.read_csv("04_mutations/propagated/endpoints/L-LCS-_mutation_mapping.csv")
color_nodes = {k:color_detailed['$mutL$ L103R'] if v else color_detailed['no $mut$ mutation']
for k,v in
endpoints.set_index('name').loc[:,mutLname].items()}
marked = sequenced_nodes.intersection(frozenset([x for x in endpoints['name'] if '10-II' in x]))
fig,ax = plt.subplots(1,1, figsize=(12*1.5,9*1.5))
plot_tree(DATASETS[TREATMENTS.index("L-LCS-")], color_nodes, ax, marked, legend=lg)
plt.savefig(os.path.join(EXPORT_PATH, f"mutL_endpoints.pdf"), bbox_inches='tight')
plt.show()
In [16]:
color_nodes = {k:color_detailed[v] if v in color_detailed else 'k'
for k,v in mut.set_index('node').mut_label_detailed.to_dict().items()}
for t,d in zip(TREATMENTS,DATASETS):
fig,ax = plt.subplots(1,1, figsize=(12*1.5,9*1.5))
to_label = mut.loc[mut.experiment==t,['mut_label','mut_label_detailed']].drop_duplicates()
patches = [mpatches.Patch(color=color_detailed[r.mut_label_detailed],
label=r.mut_label_detailed) for _,r in to_label.iterrows()]
ax.legend(handles=patches, loc="lower left")
ax.set_title(t, font={'size' : 15, 'weight':'bold'})
plot_tree(d, color_nodes, ax)
plt.savefig(os.path.join(EXPORT_PATH, f"mut-{t}-tree.png"), bbox_inches='tight')
plt.savefig(os.path.join(EXPORT_PATH, f"mut-{t}-tree.pdf"), bbox_inches='tight')
plt.show()
In [17]:
for t,d in zip(TREATMENTS,DATASETS):
ax = plt.figure(layout="constrained",
figsize=(18,10)).subplot_mosaic("""AAAAAAB
AAAAAAB""")
to_label = mut.loc[mut.experiment==t,['mut_label','mut_label_detailed']].drop_duplicates()
patches = [mpatches.Patch(color=color_detailed[r.mut_label_detailed],
label=r.mut_label_detailed) for _,r in to_label.iterrows()]
#ax.legend(handles=patches, loc="lower left")
ax['A'].legend(handles=patches, loc="lower left")
ax['A'].set_title(t, font={'size' : 15, 'weight':'bold'})
ypos, scales = plot_tree(d, color_nodes, ax['A'])
ax['B'].sharey(ax['A'])
mi = np.min(list(ypos.values()))
mx = np.max(list(ypos.values()))
endpoints = [x for _,x in mut[mut.time==20].iterrows() if x.node in ypos]
ax['B'].barh([scales[1](x.node) for x in endpoints],
[x.number_of_mutations for x in endpoints],
15,
color=[color_nodes[x.node] if x.node in color_nodes else 'k' for x in endpoints])
ax['B'].set_xlabel("Number of mutations")
for x in endpoints:
ax['B'].text(0,#x.number_of_mutations,
scales[1](x.node),
x.mut_label_detailed,
color='k',
va='center')
plt.savefig(os.path.join(EXPORT_PATH, f"nmut-{t}-tree.png"), bbox_inches='tight')
plt.savefig(os.path.join(EXPORT_PATH, f"nmut-{t}-tree.pdf"), bbox_inches='tight')
plt.show()
In [18]:
mut_type_data = []
groups = [(frozenset(mut[['no $mut$ mutation'==x for x in mut.mut_label_detailed]].node), "No $mut$ mutation"),
(frozenset(mut[['$mutS$ T497P'==x for x in mut.mut_label_detailed]].node), "$mutS$ T497P only"),
(frozenset(mut[['$mutS$ T497P, $mutY$ G153D'==x for x in mut.mut_label_detailed]].node), "$mutS$ T497P and $mutY$ G153D")]
for nodes,label in groups:
print(len(nodes.intersection(sequenced_nodes)), label)
label += f" ({len(nodes.intersection(sequenced_nodes))} genotypes)"
count = mutations[[x in nodes.intersection(sequenced_nodes) for x in mutations.node]].groupby('mutation_simple').count().mut_id
count /= count.sum()
count.name = label
mut_type_data.append(count)
dd = pd.DataFrame(mut_type_data).transpose()
fig,ax = plt.subplots(1,1,figsize=(15,7.5))
dd.plot(kind='bar', ax=ax, color=[colors['0-0-0'],colors['1-0-0'],colors['1-0-1']])
ax.set(xlabel='Mutation',ylabel="Proportion")
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
plt.savefig(os.path.join(EXPORT_PATH, "kind_of_mutations.pdf"), bbox_inches='tight')
plt.savefig(os.path.join(EXPORT_PATH, "kind_of_mutations.svg"), bbox_inches='tight')
40 No $mut$ mutation
84 $mutS$ T497P only
8 $mutS$ T497P and $mutY$ G153D
In [19]:
def plot_number(ax, ex, number):
tree, df = colgen.display.load_df(f"01_genealogies/{ex}.csv")
coal = colgen.display.coalescent(list(df.query('time==20 & extinct==0').index),
tree['branches'])
ax, scales = colgen.display.draw_tree(tree['branches'],
tree['xinfo'],
rack_layout(df.reset_index()),
oinfo={name:1 if name in coal else 0.5 for name,d in df.iterrows()},
color=number,
colormap=True,
child_color_branch=True,
ax = ax)
marked = sequenced_nodes.intersection(frozenset(tree['nodes']))
stars = ax.scatter([scales[0](x) for x in marked],
[scales[1](x) for x in marked],
fc='w',ec='k',s=300,marker='s',label='Sequenced',zorder=-1)
ax.legend(handles=[stars], loc="lower left")
format_tree_labels(tree['xinfo'].keys(), ax, scales)
ax.set_title(ex+" - Predicted number of mutations", font={'weight':'bold'})
In [20]:
ax = plt.figure(layout="constrained",
figsize=(2*12*1.5,2*9*1.5)).subplot_mosaic("""AB
CD""")
fig = plt.gcf()
for ex,a in [("L-LCS-",'D'),("S-LCS-",'B'),("L-LCS+",'C'),("S-LCS+",'A')]:
plot_number(ax[a], ex, mut.set_index('node').number_of_mutations.to_dict())
fig.savefig(os.path.join(EXPORT_PATH,'mutations_tree.png'), bbox_inches='tight')
fig.savefig(os.path.join(EXPORT_PATH,'mutations_tree.pdf'), bbox_inches='tight')