In [ ]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from matplotlib_venn import venn3_unweighted as venn3

Extracting data

In [ ]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
root_dir = "/content/gdrive/My Drive/idp-ego/GUI/algorithms/trio/"
data_dir = root_dir + 'data/'
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
In [ ]:
# path = Path(base_dir + 'data/bears')
# dest.mkdir(parents=True, exist_ok=True) I want only existing folders

e1=pd.read_csv(data_dir+'COL1538-V4-2.csv',low_memory=0)
e1.head(5)
Out[ ]:
chr pos ID refNt altNt function gene transcrptID is_CCDS is_RefSeq AAChange AnnoOther zygo BL gnomADGenomes_AF gnomADEx_AF ESP6500All_AF gnomADEx_AFR_AF gnomADEx_AMR_AF gnomADEx_EAS_AF gnomADEx_FIN_AF gnomADEx_NFE_AF gnomADEx_OTH_AF gnomADEx_SAS_AF MiddleEast_AF Sift_Pred Polyphen2HVAR_Pred CADD_Phred_V1.3 MSC_99% MSC_99%_Pred GDI varQual AD DP MQ QD Clinvar_AlleleID KnownPID PredictedPID P-value Route gnomADGen_SegDup
0 1 13273 rs531730856 G C nc_mirna DDX11L1 ENST00000456328 no no NaN intron(DDX11L1:ENST00000518655);nc_mirna(DDX11... het no 0.102748 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.42 NaN NaN NaN 190 13,11 24 25.1 7.9 NaN NO NO NaN NaN yes
1 1 13417 rs777038595 C CGAGA nc_mirna DDX11L1 ENST00000456328 no no NaN intron(DDX11L1:ENST00000450305);nc_mirna(DDX11... het no 0.066620 0.112528 NaN 0.015301 0.0 0.055929 0.175127 0.128360 0.134288 0.156572 NaN NaN NaN 4.01 NaN NaN NaN 17 10,2 12 27.8 1.2 NaN NO NO NaN NaN yes
2 1 13656 rs1263393206 CAG C essential_splicing DDX11L1 ENST00000518655 no no NaN nc_mirna(DDX11L1:ENST00000456328);nc_mirna(DDX... het no 0.489766 0.029533 NaN 0.006352 0.0 0.003094 0.061983 0.048217 0.030691 0.035543 NaN NaN NaN 5.77 NaN NaN NaN 35 4,2 6 23.5 5.8 NaN NO NO NaN NaN yes
3 1 15211 rs78601809 T G intron WASH7P ENST00000438504 no no NaN downstream(DDX11L1:ENST00000456328);downstream... hom no 0.609542 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.94 NaN NaN NaN 128 0,6 6 25.0 21.3 NaN NO NO NaN NaN yes
4 1 69511 rs75062661 A G missense OR4F5 ENST00000335137 yes yes p.Thr141Ala NaN hom no 0.843526 0.949691 0.240234 0.607496 0.0 0.999439 0.991577 0.972624 0.950621 0.985395 159:76:429 .,T B 0.05 2.31 LOW 2.32 1237 0,43 43 31.2 28.8 NaN NO NO NaN NaN yes
In [ ]:
e1[e1['chr']=='1'].head(5) #from 0 to 5-1
Out[ ]:
chr pos ID refNt altNt function gene transcrptID is_CCDS is_RefSeq AAChange AnnoOther zygo BL gnomADGenomes_AF gnomADEx_AF ESP6500All_AF gnomADEx_AFR_AF gnomADEx_AMR_AF gnomADEx_EAS_AF gnomADEx_FIN_AF gnomADEx_NFE_AF gnomADEx_OTH_AF gnomADEx_SAS_AF MiddleEast_AF Sift_Pred Polyphen2HVAR_Pred CADD_Phred_V1.3 MSC_99% MSC_99%_Pred GDI varQual AD DP MQ QD Clinvar_AlleleID KnownPID PredictedPID P-value Route gnomADGen_SegDup
0 1 13273 rs531730856 G C nc_mirna DDX11L1 ENST00000456328 no no NaN intron(DDX11L1:ENST00000518655);nc_mirna(DDX11... het no 0.102748 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.42 NaN NaN NaN 190 13,11 24 25.1 7.9 NaN NO NO NaN NaN yes
1 1 13417 rs777038595 C CGAGA nc_mirna DDX11L1 ENST00000456328 no no NaN intron(DDX11L1:ENST00000450305);nc_mirna(DDX11... het no 0.066620 0.112528 NaN 0.015301 0.0 0.055929 0.175127 0.128360 0.134288 0.156572 NaN NaN NaN 4.01 NaN NaN NaN 17 10,2 12 27.8 1.2 NaN NO NO NaN NaN yes
2 1 13656 rs1263393206 CAG C essential_splicing DDX11L1 ENST00000518655 no no NaN nc_mirna(DDX11L1:ENST00000456328);nc_mirna(DDX... het no 0.489766 0.029533 NaN 0.006352 0.0 0.003094 0.061983 0.048217 0.030691 0.035543 NaN NaN NaN 5.77 NaN NaN NaN 35 4,2 6 23.5 5.8 NaN NO NO NaN NaN yes
3 1 15211 rs78601809 T G intron WASH7P ENST00000438504 no no NaN downstream(DDX11L1:ENST00000456328);downstream... hom no 0.609542 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.94 NaN NaN NaN 128 0,6 6 25.0 21.3 NaN NO NO NaN NaN yes
4 1 69511 rs75062661 A G missense OR4F5 ENST00000335137 yes yes p.Thr141Ala NaN hom no 0.843526 0.949691 0.240234 0.607496 0.0 0.999439 0.991577 0.972624 0.950621 0.985395 159:76:429 .,T B 0.05 2.31 LOW 2.32 1237 0,43 43 31.2 28.8 NaN NO NO NaN NaN yes
In [ ]:
u=pd.DataFrame({'chr':['1','x','y'],'value':['val1','val2','val3']}) #example of a df creation
u
Out[ ]:
chr value
0 1 val1
1 x val2
2 y val3
In [ ]:
chr_e2=['1','2']
e2_filter=[x in chr_e2 for x in tuple(e1['chr'])] #search for list comprehension vs generator
#print(e2_filter)
e2=e1[e2_filter]
e2.tail(5)
Out[ ]:
chr pos ID refNt altNt function gene transcrptID is_CCDS is_RefSeq AAChange AnnoOther zygo BL gnomADGenomes_AF gnomADEx_AF ESP6500All_AF gnomADEx_AFR_AF gnomADEx_AMR_AF gnomADEx_EAS_AF gnomADEx_FIN_AF gnomADEx_NFE_AF gnomADEx_OTH_AF gnomADEx_SAS_AF MiddleEast_AF Sift_Pred Polyphen2HVAR_Pred CADD_Phred_V1.3 MSC_99% MSC_99%_Pred GDI varQual AD DP MQ QD Clinvar_AlleleID KnownPID PredictedPID P-value Route gnomADGen_SegDup
22502 2 242707101 rs6756901 A G intron D2HGDH ENST00000321264 yes yes NaN NaN het no 0.603751 0.518218 0.382489 0.871115 0.0 0.585605 0.470520 0.501298 0.519070 0.458265 NaN NaN NaN 5.22 0.09 HIGH 11.04 644 29,24 53 60.0 12.2 168071.0 NO NO NaN NaN no
22503 2 242707477 rs11552660 T C utr_3 D2HGDH ENST00000321264 yes yes NaN NaN het no 0.111900 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.44 0.09 HIGH 11.04 105 1,4 5 60.0 21.0 286302.0 NO NO NaN NaN no
22504 2 242708045 rs113372064 C T utr_3 D2HGDH ENST00000321264 yes yes NaN NaN hom no 0.073784 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.46 0.09 HIGH 11.04 52 0,2 2 60.0 25.9 286337.0 NO NO NaN NaN no
22505 2 242743651 rs744986 G A upstream AC114730.5 ENST00000437438 no no NaN downstream(GAL3ST2:ENST00000192314) het no 0.348165 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.33 NaN NaN NaN 145 3,5 8 60.0 18.1 NaN NO NO NaN NaN no
22506 2 242813124 rs13016102 T C intron CXXC11 ENST00000343216 yes yes NaN NaN hom no 0.989166 0.996225 NaN 0.970301 0.0 1.000000 0.995157 0.996347 0.996717 0.999911 NaN NaN NaN 1.94 2.31 LOW 9.35 1920 0,57 57 60.0 33.7 NaN NO NO NaN NaN no
In [ ]:
chr_e3=['1','12']
e3_filter=[x in chr_e3 for x in tuple(e1['chr'])] #generator list generator
#print(e3_filter)
e3=e1[e3_filter]
print(e3.columns)
e3.tail(5)
Index(['chr', 'pos', 'ID', 'refNt', 'altNt', 'function', 'gene', 'transcrptID',
       'is_CCDS', 'is_RefSeq', 'AAChange', 'AnnoOther', 'zygo', 'BL',
       'gnomADGenomes_AF', 'gnomADEx_AF', 'ESP6500All_AF', 'gnomADEx_AFR_AF',
       'gnomADEx_AMR_AF', 'gnomADEx_EAS_AF', 'gnomADEx_FIN_AF',
       'gnomADEx_NFE_AF', 'gnomADEx_OTH_AF', 'gnomADEx_SAS_AF',
       'MiddleEast_AF', 'Sift_Pred', 'Polyphen2HVAR_Pred', 'CADD_Phred_V1.3',
       'MSC_99%', 'MSC_99%_Pred', 'GDI', 'varQual', 'AD', 'DP', 'MQ', 'QD',
       'Clinvar_AlleleID', 'KnownPID', 'PredictedPID', 'P-value', 'Route',
       'gnomADGen_SegDup'],
      dtype='object')
Out[ ]:
chr pos ID refNt altNt function gene transcrptID is_CCDS is_RefSeq AAChange AnnoOther zygo BL gnomADGenomes_AF gnomADEx_AF ESP6500All_AF gnomADEx_AFR_AF gnomADEx_AMR_AF gnomADEx_EAS_AF gnomADEx_FIN_AF gnomADEx_NFE_AF gnomADEx_OTH_AF gnomADEx_SAS_AF MiddleEast_AF Sift_Pred Polyphen2HVAR_Pred CADD_Phred_V1.3 MSC_99% MSC_99%_Pred GDI varQual AD DP MQ QD Clinvar_AlleleID KnownPID PredictedPID P-value Route gnomADGen_SegDup
90071 12 133797649 rs147515371 T G intron ANHX ENST00000419717 yes yes NaN intron(ANHX:ENST00000545940);upstream(AC226150... hom no 0.755861 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8.86 2.31 HIGH 11.48 1130 0,32 32 60.0 24.6 NaN NO NO NaN NaN no
90072 12 133803075 rs146938968 C A intron ANHX ENST00000419717 yes yes NaN intron(ANHX:ENST00000545940) het no 0.121501 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 11.46 2.31 HIGH 11.48 506 12,16 28 60.0 17.4 NaN NO NO NaN NaN no
90073 12 133808270 rs142638257 C T intron ANHX ENST00000419717 yes yes NaN intron(ANHX:ENST00000545940) het no 0.057811 0.051146 NaN 0.041920 0.0 0.003437 0.081753 0.065813 0.060494 0.053098 NaN NaN NaN 6.49 2.31 HIGH 11.48 551 17,15 32 60.0 17.2 NaN NO NO NaN NaN no
90074 12 133808275 rs150558535 G C intron ANHX ENST00000419717 yes yes NaN intron(ANHX:ENST00000545940) het no 0.058017 0.051031 NaN 0.041947 0.0 0.003436 0.081648 0.065776 0.060614 0.053132 NaN NaN NaN 5.94 2.31 HIGH 11.48 557 13,15 28 60.0 19.9 NaN NO NO NaN NaN no
90075 12 133812415 rs141289920 C T utr_5 ANHX ENST00000545940 yes yes NaN upstream(ANHX:ENST00000419717);downstream(RNU6... hom no 0.800103 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7.43 2.31 HIGH 11.48 22 0,2 2 60.0 10.9 NaN NO NO NaN NaN no

Subsets combinations

available sets: e1,e2,e3 (exoms)

possible subsets:

  • ${e_{_{i}}}\cap\ {e_{_{j}}}\cap\ {e_{_{k}}} \ |_{i,j,k\in \{A,B,C\}\ \wedge\ i\neq j\neq k}$
  • ${e_{i}}\cap\ {e_{j}}\cap \ {{e^c}_{k}} \ |_{i,j,k\in \{A,B,C\}\ \wedge\ i\neq j\neq k}$
  • ${e_{i}}\cap \ {{e^c}_{j}} \cap \ {{e^c}_{k}} \ |_{i,j,k\in \{A,B,C\}\ \wedge\ i\neq j\neq k}$

Figure 1. Venn diagram with C as the pacient, A as affected parent and B as the unaffected parent.

Requirements:

  • y
  • must match a triplet of (pos, ref nt, gene)
In [ ]:
apriori_common=len(set(list(e1[e1['chr']=='1']['gene'])))
apriori_unique_e2=len(set(list(e2[e2['chr']=='2']['gene'])))
apriori_unique_e3=len(set(list(e3[e3['chr']=='12']['gene'])))
print("len(apriori_common)={}".format(apriori_common))
print("len(apriori_unique_e2)={}".format(apriori_unique_e2))
print("len(apriori_unique_e3)={}".format(apriori_unique_e3))
len(apriori_common)=1973
len(apriori_unique_e2)=1286
len(apriori_unique_e3)=1040
In [ ]:
genes_e1=set(list(e1['gene']))
genes_e2=set(list(e2['gene']))
genes_e3=set(list(e3['gene']))

def searchforgene(gene,genes_set):
  positions=[]
  genes=[]
  for i in range(len(genes_set)):
    if gene==genes_set[i]:
      positions.append(i)
      genes.append(genes_set[i])
  return (positions,genes)
# test
print(list(e2['gene'])[:20])
print(len(list(e2['gene'])))
print(searchforgene('DDX11L1',list(genes_e2))[0])
print(searchforgene('DDX11L1',list(genes_e2))[1])
print(searchforgene('DDX11L1',list(e2['gene']))[0])
['DDX11L1', 'DDX11L1', 'DDX11L1', 'WASH7P', 'OR4F5', 'AL627309.1', 'RP11-206L10.10', 'RP11-206L10.10', 'LINC00115', 'LINC00115', 'LINC00115', 'LINC00115', 'LINC00115', 'LINC00115', 'LINC01128', 'LINC01128', 'LINC01128', 'LINC01128', 'FAM41C', 'TUBB8P11']
22507
[132]
['DDX11L1']
[0, 1, 2]
In [ ]:
common=genes_e1.intersection(genes_e2).intersection(genes_e3)
len(common)
Out[ ]:
1973

The result shown above, shows that len(a_priori_common) is equal to the z subset (see fig. 1).

Constructing the generic function

Selecting individual tuples from the exomes files : extract_tuples_pro function

In [ ]:
def extract_tuples_pro(e,prop=("gene","pos","refNt")):
    """
    Must extract the desired parameters from the exome "e"
    in a list of tuples(default triples)
    """
    return e[list(prop)].values
In [ ]:
def extract_tuples(e,prop=("gene","pos","refNt")):
    """
    Must extract the desired parameters from the exome "e"
    in a list of tuples(default triples)
    """
    r={}
    for i in range(len(prop)):
        r[prop[i]]=tuple(e[prop[i]])

    # rr[0]=("a_gen","a_pos","a_refNt")
    rr=[]
    for i in range(len(r[prop[0]])):
        rri=[r[j][i] for j in prop]#list comprehension
        rr.append(rri)           

    return r,rr

# test
print(extract_tuples(e2)[1][:5])
print(len(extract_tuples(e2)[0]["gene"]))
print(len(extract_tuples(e2)[0]["pos"]))
print(len(extract_tuples(e2)[0]["refNt"]))
[['DDX11L1', 13273, 'G'], ['DDX11L1', 13417, 'C'], ['DDX11L1', 13656, 'CAG'], ['WASH7P', 15211, 'T'], ['OR4F5', 69511, 'A']]
22507
22507
22507

Splitting by subsets : subset function

In [ ]:
def subset(Ax,Bx,Cx,prop=("gene","pos","refNt")):
    """
    Must obtain the different subsets to contruct the Venn
    diagram.
    A and B: parents
    C: Child
    """
    A=frozenset(tuple(m) for m in extract_tuples_pro(Ax,prop)) #as set 1
    B=frozenset(tuple(m) for m in extract_tuples_pro(Bx,prop)) #as set 2
    C=frozenset(tuple(m) for m in extract_tuples_pro(Cx,prop)) #as set 3

    a=((A-B)-C)             #001
    b=((B-A)-C)             #010
    c=((C-B)-A)             #100
    y=((A&C)-B)             #101
    w=((A&B)-C)             #011
    x=((B&C)-A)             #110
    z=A.intersection(B,C)   #111

    combinations=[a,b,w,c,y,x,z]
    merge_vector=[bin(i)[2:].zfill(3) for i in range (1,2**3)]
    r=dict(zip(merge_vector,combinations))
    return r
In [ ]:
# debug
merge_vector=[bin(i)[2:].zfill(3) for i in range (1,2**3)]
print(merge_vector)
['001', '010', '011', '100', '101', '110', '111']

Venn Diagram

In [ ]:
# Venn Diagramm - ilustrative example

A,B,C=set('23 xk678'),set('13 ykjl'),set('12 zkjl678')
Vtags=[A,B,C]

a=((A-B)-C)             #001
b=((B-A)-C)             #010
c=((C-B)-A)             #100
y=((A&C)-B)             #101
w=((A&B)-C)             #011
x=((B&C)-A)             #110
z=A.intersection(B,C)   #111


combinations=[len(i) for i in [a,b,w,c,y,x,z]]
print(combinations)
merge_vector=[bin(i)[2:].zfill(3) for i in range (1,2**3)]
r=dict(zip(merge_vector,combinations))
T=[a,b,w,c,y,x,z]
ñ=dict(zip(merge_vector,T))
print(r)
print(merge_vector)
#(Abc, aBc, ABc, abC, AbC, aBC, ABC)
tags=['a','b','w','c','y','x','z']
q=dict(zip(merge_vector,tags))
plt.figure(figsize=(10,8))
v=venn3(combinations,Vtags)
for i in merge_vector:
    try:
        t=v.get_label_by_id(i).get_text()
        v.get_label_by_id(i).set_text(f"id:{i}\n{q[i[::-1]]}:{t}\n{ñ[i[::-1]]}")
    except:
        print("debuging bro, testing")

plt.show()
[1, 1, 1, 1, 4, 3, 2]
{'001': 1, '010': 1, '011': 1, '100': 1, '101': 4, '110': 3, '111': 2}
['001', '010', '011', '100', '101', '110', '111']

Venn Function

In [ ]:
def venn(r,Vtags=('A','B','C')):
    plt.figure(figsize=(10,8))
    #(Abc, aBc, ABc, abC, AbC, aBC, ABC) <- format of venn3
    # str[::-1] is used to flip strings
    mv=[bin(i)[2:].zfill(3) for i in range (1,2**3)]
    tags=['a','b','w','c','y','x','z']
    q=dict(zip(mv,tags))
    v=venn3([len(r[i]) for i in mv],Vtags)
    for i in mv:
        t=v.get_label_by_id(i).get_text()
        v.get_label_by_id(i).set_text(f"id:{i}\n{q[i[::-1]]}\n{t}")
    plt.show()
    return None

Implementing and testing the function with sets

In [ ]:
# a_priori test
prop=("gene","pos","refNt")
apriori_common=len(e1[list(prop)])
print("len(apriori_common)={}".format(apriori_common))
print(e1[list(prop)].values[:5])
e1[list(prop)].head(5)
len(apriori_common)=138013
[['DDX11L1' 13273 'G']
 ['DDX11L1' 13417 'C']
 ['DDX11L1' 13656 'CAG']
 ['WASH7P' 15211 'T']
 ['OR4F5' 69511 'A']]
Out[ ]:
gene pos refNt
0 DDX11L1 13273 G
1 DDX11L1 13417 C
2 DDX11L1 13656 CAG
3 WASH7P 15211 T
4 OR4F5 69511 A
In [ ]:
A=frozenset(tuple(u) for u in extract_tuples(e1)[1])
B=frozenset(tuple(u) for u in extract_tuples(e2)[1])
C=frozenset(tuple(u) for u in extract_tuples(e3)[1])
z=A.intersection(B,C)
print(len(z))
y=len((A&C)-B)
print(y)
13076
7222
In [ ]:
# proving the triplets actually match in the analysed case (z)
aux1=e1[["chr","gene","pos","refNt"]]
print(type(aux1))
aux1=aux1[aux1['chr']=='1']
print(type(aux1))
print(len(aux1))
aux1=aux1[["gene","pos","refNt"]].values
aux1=frozenset(tuple(u) for u in aux1)
print(len(aux1))
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
13086
13076

Time test

In [ ]:
# time test
%timeit -n 10 -r 5 extract_tuples(e1)
10 loops, best of 5: 107 ms per loop
In [ ]:
%timeit -n 10 -r 5 extract_tuples_pro(e1)
10 loops, best of 5: 12.1 ms per loop

As a conclusion, the function extract_list_pro spent about 10% of the time spent by the extract_tuples function

Extending to the real case

In [ ]:
e1=pd.read_csv(data_dir+'COL1541-V4-2.csv',low_memory=0) #father
e2=pd.read_csv(data_dir+'COL1539-V4-2.csv',low_memory=0) #mother
e3=pd.read_csv(data_dir+'COL1538-V4-2.csv',low_memory=0) #child

Example 1: prop=("gene","pos","refNt") triplet

There is no need to call prop=("gene","pos","refNt") as this is a default argument of the subset function.

In [ ]:
r=subset(e1,e2,e3)
venn(r,Vtags=("Parent(A) : Symptomatic","Parent(B) : Asymptomatic","Child(C) : Symptomatic"))#217997

Data$\rightarrow$DataFrame

Because each subset is formed of unique tuples the DataFrame constructor must be used (then if desired, convert it to excel. Also to alocate the desired subset, the id displayed in the section Example 1 must be used as a key to access every element of the dictionary.

In [ ]:
prop=("gene","pos","refNt")
pd.DataFrame(r["111"],columns=prop).head(5)
Out[ ]:
gene pos refNt
0 HAUS6 19086979 C
1 CMAHP 25106745 A
2 CCDC88C 91751808 G
3 MTMR12 32242381 G
4 RHO 129247526 G

Testing

In [ ]:
print(f"len(r['101'])={len(r['101'])}")
print(f"len(r['101']&r['001'])={len(r['100'].intersection(r['001']))}")
len(r['101'])=26567
len(r['101']&r['001'])=0

Note that len(r['101']&r['001'])=0 makes sense due to the fact that the subsets are already separed

In [ ]:
# testing subset function with Venn diagram 'created' from scratch 
At=frozenset(tuple(m) for m in extract_tuples_pro(e1))
Bt=frozenset(tuple(m) for m in extract_tuples_pro(e2))
Ct=frozenset(tuple(m) for m in extract_tuples_pro(e3))
a=((At.difference(Bt)).difference(Ct))             #001
b=((Bt.difference(At)).difference(Ct))             #010
c=((Ct.difference(Bt)).difference(At))             #100
y=((At.intersection(Ct)).difference(Bt))           #101
w=((At.intersection(Bt)).difference(Ct))           #011
x=((Bt.intersection(Ct)).difference(At))           #110
z=At.intersection(Bt,Ct)                           #111
venn3(subsets=[len(i) for i in [a,b,w,c,y,x,z]])
Out[ ]:
<matplotlib_venn._common.VennDiagram at 0x7ff1ee45bd68>

Example 2: Custom prop parameter

Setting prop=("gene") will establish a different filter for the analysis

In [ ]:
r2=subset(e1,e2,e3,prop=("gene",))
venn(r2,Vtags=("Parent(A) : Symptomatic","Parent(B) : Asymptomatic","Child(C) : Symptomatic"))

Example 3: Custom prop parameter

Setting prop=("pos") will establish a different filter for the analysis

In [ ]:
r3=subset(e1,e2,e3,prop=("pos",))
venn(r3,Vtags=("Parent(A) : Symptomatic","Parent(B) : Asymptomatic","Child(C) : Symptomatic"))#217906

Example 4: Custom prop parameter

In [ ]:
r4=subset(e1,e2,e3,prop=("gene","pos","refNt","altNt"))
venn(r4,Vtags=("Parent(A) : Symptomatic","Parent(B) : Asymptomatic","Child(C) : Symptomatic"))#218853

Data $\rightarrow$ DataFrame

In [ ]:
prop2=("gene","pos","refNt","altNt")
pato=pd.DataFrame(r4["101"],columns=prop2).head(5)
pato
Out[ ]:
gene pos refNt altNt
0 ADCY8 131880292 T A
1 HDHD1 6976113 C T
2 ATP2B2 10400253 C T
3 LHB 49519855 T G
4 NRK 105132142 C A
In [ ]:
pato.to_csv()
Out[ ]:
',gene,pos,refNt,altNt\n0,ADCY8,131880292,T,A\n1,HDHD1,6976113,C,T\n2,ATP2B2,10400253,C,T\n3,LHB,49519855,T,G\n4,NRK,105132142,C,A\n'

Application: Comparison between exomes analysis

In [ ]:
#loading files
A=pd.read_csv(root_dir+'data-us/COL1541.V4.2.gnomGN_0.05.csv',low_memory=0) #father
B=pd.read_csv(root_dir+'data-us/COL1539.V4.2.gnomGN_0.05.csv',low_memory=0) #mother
C=pd.read_csv(root_dir+'data-us/COL1538.V4.2.gnomGN_0.05.csv',low_memory=0) #child
#plotting venn
prop_=("gene","pos","refNt","altNt")#illustrative
QQ=subset(A,B,C,prop=prop_)
venn(QQ,Vtags=("Parent A (COL1541)","Parent B (COL1539)","Child C (COL1538)"))

Data $\rightarrow$ DataFrame

In [ ]:
duck=pd.DataFrame(QQ["011"],columns=prop_).head(10)
duck
Out[ ]:
gene pos refNt altNt
0 PTGER4P2 66499841 T C
1 HLA-DQB1 32633009 T C
2 ZNF596 194612 A T
3 ZNF766 52786700 CT C
4 BRWD1 40636351 C T
5 NaN 10793861 A G
6 HLA-DRB5 32497885 C T
7 LINC00273 33965672 C T
8 TRBV6-1 142028106 C G
9 CDC42BPG 64594099 T G