21 September 2010

Trees in MongoDB: my notebook with Gene Ontology

In the current post I've loaded the Gene Ontology into MongoDB and played with the tree structure of the database:

Loading GeneOntology into MongoDB

First, download GO as RDF at http://archive.geneontology.org/latest-termdb/go_daily-termdb.rdf-xml.gz and transform it with my XSLT stylesheet go2mongo.xsl (available here):
<?xml version='1.0' encoding="UTF-8" ?>
<xsl:stylesheet
xmlns:xsl='http://www.w3.org/1999/XSL/Transform'
xmlns:go="http://www.geneontology.org/dtds/go.dtd#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
version='1.0'
>
<xsl:output method="text"/>

<xsl:param name="colName">go</xsl:param>

<!-- Entry point: hands every child node of the document root to the matching templates. -->
<xsl:template match="/">
<xsl:apply-templates select="node()"/>
</xsl:template>

<!-- go:go produces no output of its own; only its rdf:RDF children are processed. -->
<xsl:template match="go:go">
<xsl:apply-templates select="child::rdf:RDF"/>
</xsl:template>

<!--
  rdf:RDF: first emits a statement dropping the target collection (named by $colName),
  then processes every go:term child. Line breaks are written as explicit character
  references so the generated text does not depend on the layout of this stylesheet.
-->
<xsl:template match="rdf:RDF">
<xsl:text>&#10;&#10;db.</xsl:text>
<xsl:value-of select="$colName"/>
<xsl:text>.drop();&#10;&#10;</xsl:text>
<xsl:apply-templates select="go:term"/>
</xsl:template>

<!--
  go:term: emits two JavaScript statements for one term:
      term={_id:...,name:...,<optional properties>};
      db.<colName>.save(term);
  Optional properties are written only when the corresponding child elements exist,
  in this fixed order: synonyms, definition, comments, part_of, is_a,
  negatively_regulates, positively_regulates, regulates, dbxrefs, associations,
  is_obsolete.
-->
<xsl:template match="go:term">
<xsl:text>term={_id:</xsl:text>
<xsl:apply-templates select="go:accession" mode="text"/>
<xsl:text>,name:</xsl:text>
<xsl:apply-templates select="go:name" mode="text"/>

<xsl:call-template name="js-array">
<xsl:with-param name="key" select="'synonyms'"/>
<xsl:with-param name="nodes" select="go:synonym"/>
<xsl:with-param name="kind" select="'text'"/>
</xsl:call-template>

<xsl:if test="go:definition">
<xsl:text>,definition:</xsl:text>
<xsl:apply-templates select="go:definition" mode="text"/>
</xsl:if>

<xsl:call-template name="js-array">
<xsl:with-param name="key" select="'comments'"/>
<xsl:with-param name="nodes" select="go:comment"/>
<xsl:with-param name="kind" select="'text'"/>
</xsl:call-template>

<!-- Relations to other terms are stored as database references. -->
<xsl:call-template name="js-array">
<xsl:with-param name="key" select="'part_of'"/>
<xsl:with-param name="nodes" select="go:part_of"/>
<xsl:with-param name="kind" select="'ref'"/>
</xsl:call-template>

<xsl:call-template name="js-array">
<xsl:with-param name="key" select="'is_a'"/>
<xsl:with-param name="nodes" select="go:is_a"/>
<xsl:with-param name="kind" select="'ref'"/>
</xsl:call-template>

<xsl:call-template name="js-array">
<xsl:with-param name="key" select="'negatively_regulates'"/>
<xsl:with-param name="nodes" select="go:negatively_regulates"/>
<xsl:with-param name="kind" select="'ref'"/>
</xsl:call-template>

<xsl:call-template name="js-array">
<xsl:with-param name="key" select="'positively_regulates'"/>
<xsl:with-param name="nodes" select="go:positively_regulates"/>
<xsl:with-param name="kind" select="'ref'"/>
</xsl:call-template>

<xsl:call-template name="js-array">
<xsl:with-param name="key" select="'regulates'"/>
<xsl:with-param name="nodes" select="go:regulates"/>
<xsl:with-param name="kind" select="'ref'"/>
</xsl:call-template>

<!-- kind 'node' dispatches to the un-moded template of each item (go:dbxref). -->
<xsl:call-template name="js-array">
<xsl:with-param name="key" select="'dbxrefs'"/>
<xsl:with-param name="nodes" select="go:dbxref"/>
<xsl:with-param name="kind" select="'node'"/>
</xsl:call-template>

<!-- Associations are nested objects, so they are written out by hand. -->
<xsl:if test="go:association">
<xsl:text>,associations:[</xsl:text>
<xsl:for-each select="go:association">
<xsl:if test="position()!=1"><xsl:text>,</xsl:text></xsl:if>
<xsl:text>{evidences:[</xsl:text>
<xsl:for-each select="go:evidence">
<xsl:if test="position()!=1"><xsl:text>,</xsl:text></xsl:if>
<xsl:apply-templates select="." mode="text"/>
</xsl:for-each>
<xsl:text>],gene_product:{name:</xsl:text>
<!-- NOTE(review): assumes exactly one gene_product with one name and one dbxref per
     association; anything else yields invalid JavaScript. Confirm against the GO RDF schema. -->
<xsl:apply-templates select="go:gene_product/go:name" mode="text"/>
<xsl:text>,dbxref:</xsl:text>
<xsl:apply-templates select="go:gene_product/go:dbxref" />
<xsl:text>}}</xsl:text>
</xsl:for-each>
<xsl:text>]</xsl:text>
</xsl:if>

<xsl:call-template name="js-array">
<xsl:with-param name="key" select="'is_obsolete'"/>
<xsl:with-param name="nodes" select="go:is_obsolete"/>
<xsl:with-param name="kind" select="'ref'"/>
</xsl:call-template>

<xsl:text>};&#10;db.</xsl:text>
<xsl:value-of select="$colName"/>
<xsl:text>.save(term);&#10;</xsl:text>
</xsl:template>

<!--
  js-array (private helper of the go:term template): when $nodes is non-empty, writes
      ,<key>:[item,item,...]
  and writes nothing otherwise.
  Param key:   property name, written as-is.
  Param nodes: the elements to list, taken in document order.
  Param kind:  how each item is rendered:
               'text' = quoted, escaped string value (mode="text" template);
               'ref'  = database reference built from the item's rdf:resource attribute;
               any other value = the item's own un-moded template.
-->
<xsl:template name="js-array">
<xsl:param name="key"/>
<xsl:param name="nodes" select="/.."/>
<xsl:param name="kind" select="'node'"/>
<xsl:if test="$nodes">
<xsl:text>,</xsl:text>
<xsl:value-of select="$key"/>
<xsl:text>:[</xsl:text>
<xsl:for-each select="$nodes">
<xsl:if test="position()!=1"><xsl:text>,</xsl:text></xsl:if>
<xsl:choose>
<xsl:when test="$kind='text'">
<xsl:apply-templates select="." mode="text"/>
</xsl:when>
<xsl:when test="$kind='ref'">
<xsl:apply-templates select="@rdf:resource"/>
</xsl:when>
<xsl:otherwise>
<xsl:apply-templates select="."/>
</xsl:otherwise>
</xsl:choose>
</xsl:for-each>
<xsl:text>]</xsl:text>
</xsl:if>
</xsl:template>

<!--
  go:dbxref: writes a cross-reference as the JavaScript object
      {database_symbol:"...",reference:"..."}
  Both values are rendered by the mode="text" template.
-->
<xsl:template match="go:dbxref">
<xsl:variable name="symbol">
<xsl:apply-templates select="go:database_symbol" mode="text"/>
</xsl:variable>
<xsl:variable name="reference">
<xsl:apply-templates select="go:reference" mode="text"/>
</xsl:variable>
<xsl:value-of select="concat('{database_symbol:',$symbol,',reference:',$reference,'}')"/>
</xsl:template>

<!-- mode="text": writes the string value of any element as a double-quoted JavaScript
     string, passing the content through the 'escape' template. -->
<xsl:template match="*" mode="text">
<xsl:text>"</xsl:text>
<xsl:call-template name="escape">
<xsl:with-param name="s" select="string(.)"/>
</xsl:call-template>
<xsl:text>"</xsl:text>
</xsl:template>

<!--
  rdf:resource attribute: writes a reference object
      {'$ref':'<colName>','$id':'<fragment>'}
  where <fragment> is the part of the attribute value after the first '#'.
-->
<xsl:template match="@rdf:resource">
<xsl:text>{'$ref':'</xsl:text>
<xsl:value-of select="$colName"/>
<xsl:text>','$id':'</xsl:text>
<xsl:value-of select="substring-after(string(.),'#')"/>
<xsl:text>'}</xsl:text>
</xsl:template>


<!--
  escape: writes $s so that it can be placed between double quotes in a JavaScript
  string literal.
  Param s: the text (or a node whose string value is used) to escape.
  Replacements, in order: backslash, double quote, carriage return, line feed.
  Backslashes must be doubled first, because every later replacement inserts a
  backslash of its own that must not be doubled again.
-->
<xsl:template name="escape">
<xsl:param name="s"/>
<xsl:variable name="backslashes">
<xsl:call-template name="escape-replace">
<xsl:with-param name="s" select="string($s)"/>
<xsl:with-param name="from" select="'\'"/>
<xsl:with-param name="to" select="'\\'"/>
</xsl:call-template>
</xsl:variable>
<xsl:variable name="quotes">
<xsl:call-template name="escape-replace">
<xsl:with-param name="s" select="string($backslashes)"/>
<xsl:with-param name="from" select="'&quot;'"/>
<xsl:with-param name="to" select="'\&quot;'"/>
</xsl:call-template>
</xsl:variable>
<xsl:variable name="returns">
<xsl:call-template name="escape-replace">
<xsl:with-param name="s" select="string($quotes)"/>
<xsl:with-param name="from" select="'&#13;'"/>
<xsl:with-param name="to" select="'\r'"/>
</xsl:call-template>
</xsl:variable>
<!-- A raw line break inside a quoted JavaScript string is a syntax error. -->
<xsl:call-template name="escape-replace">
<xsl:with-param name="s" select="string($returns)"/>
<xsl:with-param name="from" select="'&#10;'"/>
<xsl:with-param name="to" select="'\n'"/>
</xsl:call-template>
</xsl:template>

<!--
  escape-replace (private helper of 'escape'): writes $s with every occurrence of
  $from replaced by $to. Recurses once per occurrence.
-->
<xsl:template name="escape-replace">
<xsl:param name="s"/>
<xsl:param name="from"/>
<xsl:param name="to"/>
<xsl:choose>
<xsl:when test="contains($s,$from)">
<xsl:value-of select="substring-before($s,$from)"/>
<xsl:value-of select="$to"/>
<xsl:call-template name="escape-replace">
<xsl:with-param name="s" select="substring-after($s,$from)"/>
<xsl:with-param name="from" select="$from"/>
<xsl:with-param name="to" select="$to"/>
</xsl:call-template>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="$s"/>
</xsl:otherwise>
</xsl:choose>
</xsl:template>



</xsl:stylesheet>


Unzip the archive and transform 'go_daily-termdb.rdf-xml' with the stylesheet to generate the JavaScript:
xsltproc go2mongo.xsl go_daily-termdb.rdf-xml > input.js
The file input.js looks like this:
term={
_id:"GO:0000001",
name:"mitochondrion inheritance",
synonyms:["mitochondrial inheritance"],
definition:"The distribution of mitochondria, including the mitochondrial genome, into daughter cells after mitosis or meiosis, mediated by interactions between mitochondria and the cytoskeleton.",
is_a:[
{'$ref':'go','$id':'GO:0048308'},
{'$ref':'go','$id':'GO:0048311'}
]
};
db.go.save(term);
term={_id:"GO:0000002",name:"mitochondrial genome maintenance",definition:"The maintenance of the structure and integrity of the mitochondrial genome; includes replication and segregation of the mitochondrial chromosome.",is_a:[{'$ref':'go','$id':'GO:0007005'}],dbxrefs:[{database_symbol:"InterPro",reference:"IPR009446"},{database_symbol:"Pfam",reference:"PF06420"}]};
db.go.save(term);
term={_id:"GO:0000003",name:"reproduction",synonyms:["GO:0019952","GO:0050876","reproductive physiological process"],definition:"The production by an organism of new individuals that contain some portion of their genetic material inherited from that organism.",is_a:[{'$ref':'go','$id':'GO:0008150'}],dbxrefs:[{database_symbol:"Wikipedia",reference:"Reproduction"}]};
db.go.save(term);
term={_id:"GO:0000005",name:"ribosomal chaperone activity",definition:"OBSOLETE. Assists in the correct assembly of ribosomes or ribosomal subunits in vivo, but is not a component of the assembled ribosome when performing its normal biological function.",comments:["This term was made obsolete because it refers to a class of gene products and a biological process rather than a molecular function."],is_a:[{'$ref':'go','$id':'obsolete_molecular_function'}]};
db.go.save(term);
term={_id:"GO:0042254",name:"ribosome biogenesis",synonyms:["GO:0007046","ribosomal chaperone activity","ribosome biogenesis and assembly"],definition:"The process of the formation of the constituents of the ribosome subunits, their assembly, and their transport to the sites of protein synthesis.",is_a:[{'$ref':'go','$id':'GO:0022613'}],dbxrefs:[{database_symbol:"InterPro",reference:"IPR001790"},{database_symbol:"InterPro",reference:"IPR004037"},{database_symbol:"InterPro",reference:"IPR007023"},{database_symbol:"InterPro",reference:"IPR012948"},{database_symbol:"SP_KW",reference:"KW-0690"},{database_symbol:"HAMAP",reference:"MF_00554"},{database_symbol:"HAMAP",reference:"MF_00699"},{database_symbol:"HAMAP",reference:"MF_00803"},{database_symbol:"HAMAP",reference:"MF_01852"},{database_symbol:"Pfam",reference:"PF00466"},{database_symbol:"Pfam",reference:"PF04939"},{database_symbol:"Pfam",reference:"PF08142"},{database_symbol:"PROSITE",reference:"PS01082"},{database_symbol:"Wikipedia",reference:"Ribosome_biogenesis"},{database_symbol:"SMART",reference:"SM00785"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR00436"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR01575"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR02729"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR03594"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR03596"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR03597"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR03598"}]};
db.go.save(term);
term={_id:"GO:0044183",name:"protein binding involved in protein folding",synonyms:["chaperone activity"],definition:"Interacting selectively and non-covalently with any protein or protein complex (a complex of two or more proteins that may include other nonprotein molecules) that contributes to the process of protein folding.",is_a:[{'$ref':'go','$id':'GO:0005515'}]};
db.go.save(term);
term={_id:"GO:0051082",name:"unfolded protein binding",synonyms:["binding unfolded ER proteins","chaperone activity","fimbrium-specific chaperone activity","glycoprotein-specific chaperone activity","histone-specific chaperone activity","ribosomal chaperone activity","tubulin-specific chaperone activity"],definition:"Interacting selectively and non-covalently with an unfolded protein.",is_a:[{'$ref':'go','$id':'GO:0005515'}],dbxrefs:[{database_symbol:"InterPro",reference:"IPR000397"},{database_symbol:"InterPro",reference:"IPR001305"},{database_symbol:"InterPro",reference:"IPR001404"},{database_symbol:"InterPro",reference:"IPR002194"},{database_symbol:"InterPro",reference:"IPR002777"},{database_symbol:"InterPro",reference:"IPR002939"},{database_symbol:"InterPro",reference:"IPR003095"},{database_symbol:"InterPro",reference:"IPR003708"},{database_symbol:"InterPro",reference:"IPR004127"},{database_symbol:"InterPro",reference:"IPR004226"},{database_symbol:"InterPro",reference:"IPR004487"},{database_symbol:"InterPro",reference:"IPR004961"},{database_symbol:"InterPro",reference:"IPR008971"},{database_symbol:"InterPro",reference:"IPR009033"},{database_symbol:"InterPro",reference:"IPR009169"},{database_symbol:"InterPro",reference:"IPR010236"},{database_symbol:"InterPro",reference:"IPR011599"},{database_symbol:"InterPro",reference:"IPR012713"},{database_symbol:"InterPro",reference:"IPR012714"},{database_symbol:"InterPro",reference:"IPR012715"},{database_symbol:"InterPro",reference:"IPR012716"},{database_symbol:"InterPro",reference:"IPR012717"},{database_symbol:"InterPro",reference:"IPR012718"},{database_symbol:"InterPro",reference:"IPR012719"},{database_symbol:"InterPro",reference:"IPR012720"},{database_symbol:"InterPro",reference:"IPR012721"},{database_symbol:"InterPro",reference:"IPR012722"},{database_symbol:"InterPro",reference:"IPR012724"},{database_symbol:"InterPro",reference:"IPR012725"},{database_symbol:"InterPro",reference:"IPR016153"},{database_symbol:"InterPro",refe
rence:"IPR016154"},{database_symbol:"InterPro",reference:"IPR019805"},{database_symbol:"HAMAP",reference:"MF_00117"},{database_symbol:"PROSITE",reference:"MF_00117"},{database_symbol:"HAMAP",reference:"MF_00175"},{database_symbol:"PROSITE",reference:"MF_00175"},{database_symbol:"HAMAP",reference:"MF_00307"},{database_symbol:"PROSITE",reference:"MF_00307"},{database_symbol:"HAMAP",reference:"MF_00308"},{database_symbol:"PROSITE",reference:"MF_00308"},{database_symbol:"PROSITE",reference:"MF_00332"},{database_symbol:"HAMAP",reference:"MF_00505"},{database_symbol:"PROSITE",reference:"MF_00505"},{database_symbol:"HAMAP",reference:"MF_00600"},{database_symbol:"PROSITE",reference:"MF_00679"},{database_symbol:"HAMAP",reference:"MF_00790"},{database_symbol:"PROSITE",reference:"MF_00821"},{database_symbol:"HAMAP",reference:"MF_00822"},{database_symbol:"HAMAP",reference:"MF_01046"},{database_symbol:"HAMAP",reference:"MF_01152"},{database_symbol:"PROSITE",reference:"MF_01152"},{database_symbol:"HAMAP",reference:"MF_01183"},{database_symbol:"ProDom",reference:"PD010430"},{database_symbol:"Pfam",reference:"PF00684"},{database_symbol:"Pfam",reference:"PF01430"},{database_symbol:"Pfam",reference:"PF01556"},{database_symbol:"Pfam",reference:"PF01920"},{database_symbol:"Pfam",reference:"PF02556"},{database_symbol:"Pfam",reference:"PF02970"},{database_symbol:"Pfam",reference:"PF02996"},{database_symbol:"Pfam",reference:"PF03280"},{database_symbol:"PIRSF",reference:"PIRSF002356"},{database_symbol:"PIRSF",reference:"PIRSF002583"},{database_symbol:"PIRSF",reference:"PIRSF005261"},{database_symbol:"PRINTS",reference:"PR00625"},{database_symbol:"PRINTS",reference:"PR01594"},{database_symbol:"PROSITE",reference:"PS00298"},{database_symbol:"PROSITE",reference:"PS00750"},{database_symbol:"PROSITE",reference:"PS00751"},{database_symbol:"PROSITE",reference:"PS00995"},{database_symbol:"PROSITE",reference:"PS51188"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR00074"},{database_symbol:"JCVI_T
IGRFAMS",reference:"TIGR00115"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR00382"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR00809"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR02350"},{database_symbol:"JCVI_TIGRFAMS",reference:"TIGR03142"}]};
db.go.save(term);
term={_id:"GO:0000006",name:"high affinity zinc uptake transmembrane transporter activity",definition:"Catalysis of the transfer of a solute or solutes from one side of a membrane to the other according to the reaction: Zn2+(out) = Zn2+(in), probably powered by proton motive force. In high affinity transport the transporter is able to bind the solute even if it is only present at very low concentrations.",is_a:[{'$ref':'go','$id':'GO:0005385'}]};
db.go.save(term);
(...)
Here the notation {'$ref':'go','$id':'GO:0048308'} is a special object interpreted by mongo as a "Database Reference", a kind of foreign-key/link/pointer to another document, with a special method named 'fetch' that retrieves the linked document.

Load 'input.js' into mongo
mongo mygodatabase input.js

Playing with the GeneOntology Tree

I'm going to check whether one GO term is a descendant of another. First, let's define two useful JavaScript functions: a recursive one that climbs to the parent(s) of a given node through the property is_a, and a small wrapper that starts from a term id.
/**
 * Tells whether a term document is, or descends through `is_a` from, a given term.
 *
 * @param childNode  term document to start from (may be null, e.g. a reference
 *                   that could not be fetched); each entry of its `is_a` array
 *                   must offer a fetch() method returning the parent document.
 * @param parentId   _id of the candidate ancestor.
 * @param visited    optional map of already explored _id values; callers normally
 *                   omit it, it is threaded through the recursion.
 * @return true when childNode._id equals parentId or one of its ancestors does.
 */
var goNodeIsA= function (childNode, parentId, visited) {
    if (childNode == null) {
        return false;
    }
    if (childNode._id == parentId) {
        return true;
    }
    if (!childNode.is_a) {
        return false;
    }
    // A term can have several parents, so the same ancestor is reachable through
    // many paths. A node met a second time was either fully explored without
    // success or is still being explored (a cycle): in both cases nothing new
    // can be found below it.
    visited = visited || {};
    if (childNode._id != null) {
        var key = String(childNode._id);
        if (visited[key] === true) {
            return false;
        }
        visited[key] = true;
    }
    for (var i = 0; i < childNode.is_a.length; ++i) {
        if (goNodeIsA(childNode.is_a[i].fetch(), parentId, visited)) {
            return true;
        }
    }
    return false;
};

/**
 * Looks up the term whose _id is childId in the `go` collection and tells
 * whether it is, or descends from, the term whose _id is parentId.
 */
var goIsA=function (childId, parentId)
{
    var childNode = db.go.findOne({_id:childId});
    return goNodeIsA(childNode, parentId);
}

Now, let's check whether GO:0003723 (RNA binding) is a descendant of GO:0005488 (binding):
> goIsA("GO:0003723","GO:0005488");
true

And is GO:0003723 (RNA binding) a descendant of GO:0050355 (triphosphatase activity)?
> goIsA("GO:0003723","GO:0050355");
false

Loop over all the GO terms and find the descendants of GO:0005488 (binding):
> db.go.find({},{name:1,is_a:1}).forEach(function(term) { if(goIsA(term._id,'GO:0005488')) printjson(term); })

(...)
{
"_id" : "GO:0080084",
"name" : "5S rDNA binding",
"is_a" : [
{
"$ref" : "go",
"$id" : "GO:0000182"
}
]
}
{
"_id" : "GO:0080087",
"name" : "callose binding",
"is_a" : [
{
"$ref" : "go",
"$id" : "GO:0030247"
}
]
}
{
"_id" : "GO:0080115",
"name" : "myosin XI tail binding",
"is_a" : [
{
"$ref" : "go",
"$id" : "GO:0032029"
}
]
}
{
"_id" : "GO:0090079",
"name" : "translation regulator activity, nucleic acid binding",
"is_a" : [
{
"$ref" : "go",
"$id" : "GO:0003676"
},
{
"$ref" : "go",
"$id" : "GO:0045182"
}
]
}
(...)




That's it

Pierre

No comments: