In [1]:
import glob
import os
import json
from io import StringIO
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors
from utils import TREATMENTS, COLORS, DATASETS
genealogies = pd.concat([pd.read_csv(x) for x in DATASETS])
PATH_IMPORT = "raw/24-03-22/**/summary.json"
ignore = [
"/Breseq/WT_s/WT_a7/WT_e7/data", # Duplicated data
"/Breseq/17_s/Line_17/data", # Duplicated data
"/Breseq/PX-10/", # Unrelated data
"/Breseq/PE-5-WT-L-E3", # Low coverage
"/Breseq/PE-4-WT-L-E3" # Re-sequenced
]
EXPORT_PATH = '04_mutations'
EXPORT_TREE_PATH = '04_mutations/trees'
if not os.path.exists(EXPORT_PATH):
os.mkdir(EXPORT_PATH)
if not os.path.exists(EXPORT_TREE_PATH):
os.mkdir(EXPORT_TREE_PATH)
In [2]:
def read_breseq_summary(file):
js = json.load(open(file,'r'))
assert len(js['references']['reference'].keys()) == 1
ref = list(js['references']['reference'].keys())[0]
return dict(folder=file.replace('/data/summary.json','').replace('raw/24-03-22/',''),
name=file.split("/")[-3],
ref=ref,
coverage_ref=js['references']['reference'][ref]['coverage_average'],
fraction_aligned_reads=js['reads']['total_fraction_aligned_reads'],
n_reads=js['reads']['total_reads'],
read_files = ", ".join(js['reads']['read_file'].keys()),
n_read_files=len(js['reads']['read_file'].keys()))
def read_breseq_mutation_table(file):
with open(file, 'r') as fh:
html = fh.read()
read = pd.read_html(StringIO(html.split('<!--Output Html_Mutation_Table_String-->')[1].split('</body>')[0]))[0]
read.columns = [x[1] for x in read.columns]
read['file'] = file
return read
def parse_PE(x):
try:
x = x.replace('-','_')
splitted = x.split('_')
if x.startswith("Line_17"):
geno = "LCS-" if "mutS" in x else "LCS+"
return(dict(pe=x,
cycle=0, size="",
ancestor=geno, treatment="",
tube="", genotype=f"{geno} ancestor"))
if "PE" in x and "Line" not in x:
cycle = int(splitted[1])+1
large = splitted[-1][0].upper() == splitted[-1][0]
tube = splitted[-1]
geno = 'LCS-' if 'WT_' in x else 'LCS+'
size = 'L' if large else 'S'
try:
name = f"{size}-{geno}_{cycle}-I-{tube}"
parent = genealogies.query(f"name=='{name}'").parent.values[0].replace("LLCS_ROOT","LCS- ancestor")
except Exception:
parent = 'UNK'
id_geno = f"{parent.replace('_',':')}→{cycle}-I-{tube}"
elif splitted[0] in ("WT","17"):
cycle = 11
large = splitted[-1][0].upper() == splitted[-1][0]
size = 'L' if large else 'S'
tube = splitted[-1]
geno = {'WT':'LCS-',"17":"LCS+"}[splitted[0]]
id_geno = f"{size}-{geno}: 10-II-{tube}→"
parent = f"{size}-{geno}_10-II-{tube}"
else:
return {"pe":x, "microcosm":x}
exp = size + '-' + geno
return(dict(pe=x,
cycle=cycle, size=size,
parent=parent,
ancestor=geno, treatment=exp,
tube=tube, genotype=id_geno))
except Exception as ex:
return {'error':ex, "pe":x}
In [3]:
metadata = []
data = []
for s in glob.glob(PATH_IMPORT, recursive=True):
if not any([x in s for x in ignore]):
metadata.append(read_breseq_summary(s))
metadata[-1].update(parse_PE(metadata[-1]['name'].replace('org','')))
h = s.replace("data/summary.json", 'output/index.html')
data.append(read_breseq_mutation_table(h))
try:
data[-1]['genotype'] = metadata[-1]['genotype']
except:
print(metadata[-1])
raise
else:
print("Ignored:", s)
metadata = pd.DataFrame(metadata)
data = pd.concat(data)
Ignored: raw/24-03-22/Breseq/PE-4-WT-L-E3org/data/summary.json Ignored: raw/24-03-22/Breseq/PX-10/PX_17_a6/data/summary.json Ignored: raw/24-03-22/Breseq/PX-10/PX_17_b3/data/summary.json Ignored: raw/24-03-22/Breseq/PX-10/PX_17_c3/data/summary.json Ignored: raw/24-03-22/Breseq/PX-10/PX_17_e5/data/summary.json
Ignored: raw/24-03-22/Breseq/17_s/Line_17/data/summary.json
Ignored: raw/24-03-22/Breseq/WT_s/WT_a7/WT_e7/data/summary.json
Ignored: raw/24-03-22/Breseq/PE-4-WT-L-E3/data/summary.json Ignored: raw/24-03-22/Breseq/PE-5-WT-L-E3/data/summary.json
In [4]:
# We consider that two mutations are the same if
# ["position","mutation","annotation","gene","description"] are equal.
# Two entries with the same values in those columns but different evidence are
# considered the same mutation.
# Example: At position 1354000 there is a Δ9 bp deletion in wspB that is sometimes detected
# with JC sometimes with MC and JC
data.fillna({"annotation":''}, inplace=True)
cols = ["position","mutation","annotation","gene","description"]
count = data.groupby(cols).count().sort_values('genotype').genotype
count.name = "time_sequenced"
mutations = pd.pivot(data, index=cols, columns='genotype', values='evidence').fillna(0)
mutations['time_sequenced'] = count
mutations = mutations[sorted(mutations.columns, reverse=True)].reset_index()
mutations.columns.name = ''
mutations['mut_id'] = [f"MUT_{x:04}" for x in range(mutations.shape[0])]
mutations['name'] = mutations['mut_id']
# Checks that the mutations are unique
assert mutations.shape[0] == mutations.loc[:,cols].drop_duplicates().shape[0]
In [5]:
lcsp_ancestor = set(x.mut_id for k,x in mutations.iterrows() if x['LCS+ ancestor'])
lcsm_ancestor = set(x.mut_id for k,x in mutations.iterrows() if x['LCS- ancestor'])
all_sequences = set(mutations[mutations.time_sequenced==data.file.nunique()].mut_id)
any_ancestors = set.union(lcsm_ancestor, lcsp_ancestor)
both_ancestors = set.intersection(lcsm_ancestor, lcsp_ancestor)
print(f"There are:\n\t {mutations.shape[0]} unique mutations")
print(f"\t{len(lcsp_ancestor)} in LCS+ ancestor")
print(f"\t{len(lcsm_ancestor)} in LCS- ancestor")
print(f"\t{len(all_sequences)} in all sequencing")
print(f"\t{len(any_ancestors)} in any of the two ancestors")
print(f"\t{len(both_ancestors)} in any of the two ancestors")
There are: 9271 unique mutations 58 in LCS+ ancestor 74 in LCS- ancestor 28 in all sequencing 78 in any of the two ancestors 54 in any of the two ancestors
In [6]:
# Count the number of mutations in metadata
cmut = data.groupby('genotype').count().mutation
metadata.set_index('genotype',inplace=True)
metadata["n_mutations"] = cmut
In [7]:
mut_mutations = mutations[['mut' in x.gene for k,x in mutations.iterrows()]].copy()
mut_mutations['mutation_name'] = [x.gene.split()[0]+' '+x.annotation.split()[0] for k,x in mut_mutations.iterrows()]
col_mut = cols+["time_sequenced"]
mut_mutations = mut_mutations.drop(columns=col_mut).set_index('mutation_name').drop(columns=['mut_id','name']).transpose()
In [8]:
# EXPORT
metadata.to_csv(os.path.join(EXPORT_PATH, "sequenced_cultures.csv"))
mutation_description = mutations.loc[:, ['mut_id', 'position', 'mutation',
'annotation', 'gene', 'description',
'time_sequenced', 'name']].set_index('mut_id')
mutation_description.to_csv(os.path.join(EXPORT_PATH,"mutation_description.csv"))
long_table = pd.merge(data, mutations.loc[:,cols+['mut_id']], left_on=cols, right_on=cols, )
long_table.to_csv(os.path.join(EXPORT_PATH,"sequencing_long_table.csv"))
wide_table = mutations.set_index('mut_id')
wide_table.to_csv(os.path.join(EXPORT_PATH,"sequencing_wide_table.csv"))
mut_mutations.to_csv(os.path.join(EXPORT_PATH,"mut_mutations_wide.csv"))
all_data = pd.merge(long_table, metadata, left_on="genotype", right_index=True)
for treat, df in all_data.groupby('treatment'):
if treat:
d = df.loc[:, ['parent', 'mut_id', 'cycle']].rename(columns={"parent":"node", "mut_id":"mutation"})
anc = f"{treat.split('-',1)[1]} ancestor"
r = all_data[all_data.genotype==anc].loc[:, ['genotype', 'mut_id']].rename(columns={"genotype":"node", "mut_id":"mutation"})
r.replace(anc, treat+"_"+"ROOT", inplace=True)
out = pd.concat([d,r]).reset_index(drop=True).fillna({'cycle':0})
out['endpoint'] = [c == 11 or c == 0 for c in out.cycle]
out.loc[:, ['node', 'mutation']].drop_duplicates().to_csv(os.path.join(EXPORT_TREE_PATH,treat+"_mutations.csv"))
out.loc[out.endpoint==True, ['node', 'mutation']].drop_duplicates().to_csv(os.path.join(EXPORT_TREE_PATH,treat+"_mutations_endpoints.csv"))
In [9]:
import sqlite3
with sqlite3.connect("lce_data.sqlite") as database:
metadata.drop(columns=['cycle','size','ancestor','treatment','tube','name','pe']).to_sql("sequenced_cultures", database, if_exists="replace", dtype={'genotype':"TEXT PRIMARY KEY"})
mutation_description.drop(columns=['name']).to_sql("mutations", database, if_exists="replace", dtype={'mut_id':"TEXT PRIMARY KEY"})
long_table.loc[:,["evidence","mut_id","genotype"]].to_sql("sequencing", database, if_exists="replace", index=False)
In [10]:
fig, ax = plt.subplots(1,1, figsize=(7,4))
for k, v in metadata.groupby(["ancestor", "size"]):
ax.scatter(v.cycle-(1 if k[1] else 0)+(np.random.random(v.shape[0])-0.5)*0.8, v.n_mutations,
label=f"{k[1]}-{k[0]}" if k[1] else k[0]+' ancestor',
color=COLORS[f"{k[1]}-{k[0]}"] if k[1] else 'k',
marker='.' if k[1] else {"LCS+":"x","LCS-":"+"}[k[0]])
ax.legend()
ax.set(xlabel='Cycle',
xlim=(-0.5,10.5),
ylim=(0,420),
ylabel="Number of mutations identified",
xticks=np.arange(11))
for x in np.arange(11)+0.5:
plt.axvline(x,color='k',ls='--', lw=1)
fig.savefig(os.path.join(EXPORT_PATH, 'number_of_mutations.svg'),bbox_inches='tight')
fig.savefig(os.path.join(EXPORT_PATH, 'number_of_mutations.pdf'),bbox_inches='tight')
Filtering out the mutations from the ancestors¶
In [11]:
filtered_out = long_table.copy()
print(f"{filtered_out.shape[0]} entries")
for anc,anc_name in (["LCS+ ancestor", "LCS+"], ["LCS- ancestor", "LCS-"]):
anc_dt = frozenset(filtered_out[filtered_out.genotype==anc].mut_id)
genotypes = frozenset(metadata[metadata.ancestor==anc_name].index)
print(f"Ancestor {anc}: {len(anc_dt)} mutations to be filtered out of {len(genotypes)} datasets")
idx = np.logical_not(np.logical_and([x in genotypes for x in filtered_out.genotype],
[x in anc_dt for x in filtered_out.mut_id]))
filtered_out = filtered_out.loc[idx]
print(filtered_out.shape[0])
nmut2 = filtered_out.groupby("genotype").count().mut_id
metadata2 = metadata.copy()
metadata2['n_mutations'] = nmut2
fig, ax = plt.subplots(1,1, figsize=(7,4))
for k, v in metadata2.groupby(["ancestor", "size"]):
if k[1]:
ax.scatter(v.cycle-(1 if k[1] else 0)+(np.random.random(v.shape[0])-0.5)*0.8, v.n_mutations,
label=f"{k[1]}-{k[0]}" if k[1] else k[0]+' ancestor',
color=COLORS[f"{k[1]}-{k[0]}"] if k[1] else 'k',
marker='.' if k[1] else {"LCS+":"x","LCS-":"+"}[k[0]])
ax.legend()
ax.set(xlabel='Cycle',
xlim=(-0.5,10.5),
#ylim=(-1,420),
ylabel="Number of mutations identified",
xticks=np.arange(11))
for x in np.arange(11)+0.5:
plt.axvline(x,color='k',ls='--', lw=1)
plt.axhline(0,color='k',ls='--', lw=1)
fig.savefig(os.path.join(EXPORT_PATH, 'filtered_number_of_mutations.svg'),bbox_inches='tight')
fig.savefig(os.path.join(EXPORT_PATH, 'filtered_number_of_mutations.pdf'),bbox_inches='tight')
37819 entries Ancestor LCS+ ancestor: 58 mutations to be filtered out of 108 datasets 32252 Ancestor LCS- ancestor: 74 mutations to be filtered out of 90 datasets 25965
A table for all sequencing data¶
In [12]:
cat = {'folder': 'Breseq', 'name': 'ID',
'ref': 'Reference', 'coverage_ref': 'Reference',
'fraction_aligned_reads': 'Reads', 'n_reads': 'Reads',
'read_files': 'Reads', 'n_read_files': 'Reads',
'cycle': 'ID', 'size': 'ID', 'ancestor': 'ID', 'treatment': 'ID',
'tube': 'ID', 'genotype': 'ID', 'n_mutations':'Mutations'}
metadisp = metadata.copy().reset_index()
metadisp = pd.merge(metadisp, mut_mutations, left_on='genotype',right_index=True)
cols_mut = list(mut_mutations.columns)
metadisp = metadisp[["name","genotype","treatment",
"n_mutations",
"n_read_files", "n_reads", "fraction_aligned_reads",
"ref","coverage_ref"]+cols_mut ]
metadisp.set_index('genotype',inplace=True)
metadisp.sort_values('n_mutations', inplace=True)
metadisp.columns = pd.MultiIndex.from_tuples([(cat[x], x)
if x in cat
else (x.split()[0], x)
for x in metadisp.columns
])
C0 = matplotlib.colors.to_hex('C0')
border = [{'selector': 'th', 'props': 'border-left: 1px solid black'},
{'selector': 'td', 'props': 'border-left: 1px solid black'}]
html = metadisp.style.set_table_styles({
('ID','name'): border
}, overwrite=False, axis=0)\
.set_sticky(axis="index")\
.set_sticky(axis="columns")\
.bar(subset=[('Reference','coverage_ref'), ('Reads','fraction_aligned_reads'),('Reads',"n_reads"),('Mutations',"n_mutations")], color=matplotlib.colors.to_hex('C0'))\
.map(lambda x: (f"background-color:{matplotlib.colors.to_hex(COLORS[x])}" if x in COLORS else ''), subset=[('ID','treatment')])\
.map(lambda x: (f"background-color:{C0};color:{C0};border-color:black" if x else 'color:white;border-color:black'),
subset=[(x.split()[0],x) for x in cols_mut])\
.map_index(lambda v: "max-width:3em;font-size:0.8em;" if 'mut' in v and len(v.split())>1 else '', axis=1)\
.to_html()
with open(os.path.join(EXPORT_PATH,'sequencing.html'),'w') as f:
f.write("<html><head><meta charset=\"utf-8\"><title>Sequencing Data</title><style> table, th, td {border: 1px solid} table{border-collapse: collapse;} thead{background-color:white;} </style></head><body>")
f.write(html)
f.write("</body>")
In [13]:
key_col = lambda x: ((0,0) if ("ancestor" in x or 'ROOT' in x)
else (0,-1) if ("name" in x or "All" in x)
else ({'+':0,'-':1}[x[5]],{'S':0,'L':1}[x[0]], int(x.split(":")[1].split('-')[0])))
common_mut = mutations.set_index('mut_id').loc[list(any_ancestors),:].reset_index()
common_mut['All Sequencing'] = [m in all_sequences for m in common_mut.mut_id]
common_mut = common_mut.set_index(cols+['time_sequenced','mut_id']).drop(columns='name')
common_columns = sorted(list(common_mut.columns), key=key_col)
common_mut = common_mut.loc[:,common_columns].reset_index()
common_mut = common_mut.sort_values(['All Sequencing','LCS- ancestor', 'LCS+ ancestor','time_sequenced'], ascending=False).reset_index(drop=True)
html = common_mut.style.set_sticky(axis="index")\
.map(lambda x: (f"background-color:{C0};color:{C0};border-color:black" if x else 'color:white;border-color:black'),
subset=common_columns)\
.set_sticky(axis="columns")\
.to_html()
with open(os.path.join(EXPORT_PATH,'common_mutations.html'),'w') as f:
f.write("<html><head><meta charset=\"utf-8\"><title>Sequencing Data</title><style> table, th, td {border: 1px solid} table{border-collapse: collapse;} thead{background-color:white;} </style></head><body>")
f.write(html)
f.write("</body>")
In [14]:
m_mut = mutations[['mut' in x for x in mutations.gene]].copy()
m_mut['mutation_name'] = [x.gene.split()[0]+' '+x.annotation.split()[0] for k,x in m_mut.iterrows()]
m_mut = m_mut.set_index(cols+['time_sequenced','mut_id',]).drop(columns=['name'])
sorted_cols = sorted(list(m_mut.columns), key=key_col)
m_mut = m_mut.loc[:,sorted_cols].reset_index().drop(columns='mut_id').set_index(cols+['mutation_name','time_sequenced'])
print("Latex Table:")
print(80*"-")
print(m_mut.reset_index().loc[:,cols].rename(columns=lambda x: x.capitalize().replace("_"," ").replace("Time", r"\# times")).set_index('Gene').to_latex().replace('_',r"\_"))
print(80*"-")
Latex Table: -------------------------------------------------------------------------------- \begin{tabular}{lrlll} \toprule & Position & Mutation & Annotation & Description \\ Gene & & & & \\ \midrule mutY ← & 355688 & C→T & G153D (GGC→GAC) & A/G‑specific adenine glycosylase \\ mutL → & 586966 & C→T & S5L (TCG→TTG) & DNA mismatch repair protein \\ mutL → & 587050 & A→G & E33G (GAG→GGG) & DNA mismatch repair protein \\ mutL → & 587260 & T→G & L103R (CTG→CGG) & DNA mismatch repair protein \\ mutL → & 587555 & C→T & L201L (CTC→CTT) & DNA mismatch repair protein \\ mutL → & 587737 & A→G & N262S (AAC→AGC) & DNA mismatch repair protein \\ mutL → & 588126 & (G)5→6 & coding (1174/1902 nt) & DNA mismatch repair protein \\ mutL → & 588551 & C→T & A533A (GCC→GCT) & DNA mismatch repair protein \\ mutS ← & 1300356 & C→T & G707S (GGC→AGC) & DNA mismatch repair protein \\ mutS ← & 1300673 & +A & coding (1802/2592 nt) & DNA mismatch repair protein \\ mutS ← & 1300783 & A→G & R564R (CGT→CGC) & DNA mismatch repair protein \\ mutS ← & 1300986 & T→G & T497P (ACC→CCC) & DNA mismatch repair protein \\ mutS ← & 1301392 & A→G & R361R (CGT→CGC) & DNA mismatch repair protein \\ mutS ← & 1302049 & A→G & R142R (CGT→CGC) & DNA mismatch repair protein \\ mutS ← & 1302080 & T→C & N132S (AAC→AGC) & DNA mismatch repair protein \\ mutS ← / ← PFLU\_1165 & 1302576 & G→A & intergenic (‑102/+68) & DNA mismatch repair protein/putative membrane protein \\ mutM ← & 6343361 & A→G & S46P (TCC→CCC) & formamidopyrimidine‑DNA glycosylase \\ \bottomrule \end{tabular} --------------------------------------------------------------------------------
In [15]:
m_mut.sort_values('time_sequenced',ascending=False, inplace=True)
yticks = [f"{x[5]:>15} ({x[6]})" for x in m_mut.index]
xticks = [x for x in m_mut.columns]
fig, ax = plt.subplots(1,1, figsize=(30,5))
ax.matshow(m_mut.values!=0, cmap="bone_r")
ax.set(yticks=range(len(yticks)),
yticklabels=yticks,
xticks=range(len(xticks)),
xticklabels=xticks);
ax.set_xticklabels(xticks, rotation=90);
fig.savefig(os.path.join(EXPORT_PATH,"sequenced_mut_mutations.pdf"), bbox_inches='tight')
In [ ]: