Search this keyword

Viewing phylogenies on the web: Javascript conversion of Newick tree to SVG

Quick test of an idea I'm playing with. By embedding a Newick-format tree description in HTML and adding some Javascript I can go from this:
<div class="newick" data-drawing-type="circlephylogram">((((((((219923430:0.046474,219923429:0.009145):0.037428,219923426:0.038397):0.015434,(219923419:0.022612,219923420:0.015561):0.050529):0.004828,(207366059:0.020922,207366058:0.016958):0.038734):0.003901,219923422:0.072942):0.005414,((219923443:0.038239,219923444:0.025617):0.037592,(219923423:0.056081,219923421:0.055808):0.003788):0.009743):0.001299,(219923469:0.072965,125629132:0.044638):0.012516):0.011647,(((((219923464:0.069894,((((((125628927:0.021470,219923456:0.021406):0.003083,219923455:0.021625):0.029147,219923428:0.042785):0.001234,225685777:0.037478):0.016027,((((56549933:0.003265,219923453:-0.000859):0.015462,150371743:0.009558):0.004969,219923452:0.014401):0.024398,((((((150371732:0.001735,((150371733:0,150371736:0):6.195e-05,150371735:-6.195e-05):7.410e-05):0.000580,150371734:0.001196):0.000767,(150371737:0.001274,(150371738:0,150371740:0):0.000551):0.000498):0.000905,70608555:0.003205):0.004807,150371741:0.010751):8.979e-05,150371739:0.006647):0.022090):0.012809):0.011838,219923427:0.057366):0.009364):0.004238,((219923450:0.022699,125628925:0.012519):0.048088,219923466:0.046514):0.003608):0.007025,((56549930:0.067920,219923440:0.059754):0.002384,((219923438:0.044329,219923439:0.038470):0.014514,(219923442:0.038021,(((207366060:0,207366061:0):0.001859,125628920:0.001806):0.024716,((((125628921:0.005610,207366057:0.003531):0.001354,(207366055:0.003311,207366056:0.002174):0.003225):0.011836,207366062:0.019303):0.003741,((((((207366047:0,207366048:0):0,207366049:0):0.001563,207366050:0.000272):0.002214,(207366051:0.000818,125628919:0.001017):0.000675):0.003916,207366054:0.007924):0.004138,((219923441:0.000975,207366052:-0.000975):0.000494,207366053:-0.000494):0.012373):0.010040):0.003349):0.017594):0.011029):-0.003134):0.011235):0.004149,((((219923435:0.064354,219923424:0.067340):0.002972,219923454:0.045087):0.002092,((219923460:0.027282,219923465:0.025756):0.031269,(219923462:0.017555,219923425:-0.009591):0.047358):0.006198):0.004242,(((219923463:0.031885,(219923459:0.000452,219923458:-0.000452):0.029292):0.005200,225685776:0.024691):0.020131,219923461:0.042563):0.004673):0.009128):0.001452,((56549934:0.088142,56549929:0.066475):0.004212,(219923437:0.048313,219923436:0.044997):0.014553):0.008927):0);</div>
...to this (you will need an SVG-capable browser to see anything). The Javascript parses the Newick tree, generates SVG, then replaces the Newick tree in the HTML with the corresponding picture. No need for server-side graphics, the diagram is generated by your web browser based on the Newick tree description.
((((((((219923430:0.046474,219923429:0.009145):0.037428,219923426:0.038397):0.015434,(219923419:0.022612,219923420:0.015561):0.050529):0.004828,(207366059:0.020922,207366058:0.016958):0.038734):0.003901,219923422:0.072942):0.005414,((219923443:0.038239,219923444:0.025617):0.037592,(219923423:0.056081,219923421:0.055808):0.003788):0.009743):0.001299,(219923469:0.072965,125629132:0.044638):0.012516):0.011647,(((((219923464:0.069894,((((((125628927:0.021470,219923456:0.021406):0.003083,219923455:0.021625):0.029147,219923428:0.042785):0.001234,225685777:0.037478):0.016027,((((56549933:0.003265,219923453:-0.000859):0.015462,150371743:0.009558):0.004969,219923452:0.014401):0.024398,((((((150371732:0.001735,((150371733:0,150371736:0):6.195e-05,150371735:-6.195e-05):7.410e-05):0.000580,150371734:0.001196):0.000767,(150371737:0.001274,(150371738:0,150371740:0):0.000551):0.000498):0.000905,70608555:0.003205):0.004807,150371741:0.010751):8.979e-05,150371739:0.006647):0.022090):0.012809):0.011838,219923427:0.057366):0.009364):0.004238,((219923450:0.022699,125628925:0.012519):0.048088,219923466:0.046514):0.003608):0.007025,((56549930:0.067920,219923440:0.059754):0.002384,((219923438:0.044329,219923439:0.038470):0.014514,(219923442:0.038021,(((207366060:0,207366061:0):0.001859,125628920:0.001806):0.024716,((((125628921:0.005610,207366057:0.003531):0.001354,(207366055:0.003311,207366056:0.002174):0.003225):0.011836,207366062:0.019303):0.003741,((((((207366047:0,207366048:0):0,207366049:0):0.001563,207366050:0.000272):0.002214,(207366051:0.000818,125628919:0.001017):0.000675):0.003916,207366054:0.007924):0.004138,((219923441:0.000975,207366052:-0.000975):0.000494,207366053:-0.000494):0.012373):0.010040):0.003349):0.017594):0.011029):-0.003134):0.011235):0.004149,((((219923435:0.064354,219923424:0.067340):0.002972,219923454:0.045087):0.002092,((219923460:0.027282,219923465:0.025756):0.031269,(219923462:0.017555,219923425:-0.009591):0.047358):0.006198):0.004242,(((219923463:0.031885,(219923459:0.000452,219923458:-0.000452):0.029292):0.005200,225685776:0.024691):0.020131,219923461:0.042563):0.004673):0.009128):0.001452,((56549934:0.088142,56549929:0.066475):0.004212,(219923437:0.048313,219923436:0.044997):0.014553):0.008927):0);
Here's the same tree as a phylogram:
((((((((219923430:0.046474,219923429:0.009145):0.037428,219923426:0.038397):0.015434,(219923419:0.022612,219923420:0.015561):0.050529):0.004828,(207366059:0.020922,207366058:0.016958):0.038734):0.003901,219923422:0.072942):0.005414,((219923443:0.038239,219923444:0.025617):0.037592,(219923423:0.056081,219923421:0.055808):0.003788):0.009743):0.001299,(219923469:0.072965,125629132:0.044638):0.012516):0.011647,(((((219923464:0.069894,((((((125628927:0.021470,219923456:0.021406):0.003083,219923455:0.021625):0.029147,219923428:0.042785):0.001234,225685777:0.037478):0.016027,((((56549933:0.003265,219923453:-0.000859):0.015462,150371743:0.009558):0.004969,219923452:0.014401):0.024398,((((((150371732:0.001735,((150371733:0,150371736:0):6.195e-05,150371735:-6.195e-05):7.410e-05):0.000580,150371734:0.001196):0.000767,(150371737:0.001274,(150371738:0,150371740:0):0.000551):0.000498):0.000905,70608555:0.003205):0.004807,150371741:0.010751):8.979e-05,150371739:0.006647):0.022090):0.012809):0.011838,219923427:0.057366):0.009364):0.004238,((219923450:0.022699,125628925:0.012519):0.048088,219923466:0.046514):0.003608):0.007025,((56549930:0.067920,219923440:0.059754):0.002384,((219923438:0.044329,219923439:0.038470):0.014514,(219923442:0.038021,(((207366060:0,207366061:0):0.001859,125628920:0.001806):0.024716,((((125628921:0.005610,207366057:0.003531):0.001354,(207366055:0.003311,207366056:0.002174):0.003225):0.011836,207366062:0.019303):0.003741,((((((207366047:0,207366048:0):0,207366049:0):0.001563,207366050:0.000272):0.002214,(207366051:0.000818,125628919:0.001017):0.000675):0.003916,207366054:0.007924):0.004138,((219923441:0.000975,207366052:-0.000975):0.000494,207366053:-0.000494):0.012373):0.010040):0.003349):0.017594):0.011029):-0.003134):0.011235):0.004149,((((219923435:0.064354,219923424:0.067340):0.002972,219923454:0.045087):0.002092,((219923460:0.027282,219923465:0.025756):0.031269,(219923462:0.017555,219923425:-0.009591):0.047358):0.006198):0.004242,(((219923463:0.031885,(219923459:0.000452,219923458:-0.000452):0.029292):0.005200,225685776:0.024691):0.020131,219923461:0.042563):0.004673):0.009128):0.001452,((56549934:0.088142,56549929:0.066475):0.004212,(219923437:0.048313,219923436:0.044997):0.014553):0.008927):0);

ZooBank data model

I'm trying to get my head around the data model used by ZooBank to store taxonomic names. To do this, I've built a graph for the species Belonoperca pylei described by Baldwin & Smith described in:
Baldwin, C. C., & Smith, W. L. (1998). Belonoperca pylei, a new species of seabass (Teleostei: Serranidae: Epinephelinae: Diploprionini) from the cook islands with comments on relationships among diploprionins. Ichthyological Research, 45(4), 325–339. doi:10.1007/BF02725185

After extracting some data from ZooBank API I created a DOT file connecting the various "taxon name usages" associated with Belonoperca pylei and constructed a graph using GraphViz:
Zoobank
You can grab the DOT file here, and a bigger version of the image is on Flickr. I've labelled taxon names and references with plain text as well as the UUIDs that serve as identifiers in ZooBank. (Update: the original diagram had Belonoperca pylei Baldwin & Smith, 1998 sensu Eschmeyer [9F53EF10-30EE-4445-A071-6112D998B09B] in the wrong place, which I've now fixed.)

This is a fairly simple case of a single species, but it's already starting to look a tad complicated. We have Belonoperca pylei Baldwin & Smith, 1998 linked to its original description (doi:10.1007/BF02725185) and to the genus Belonoperca Fowler & Bean, 1930 (linked to its original publication http://biostor.org/reference/105997) as interpreted by ("sensu") Baldwin & Smith, 1998. Belonoperca Fowler & Bean 1930 sensu Baldwin & Smith 1998 is linked to the original use of that genus (i.e., Belonoperca Fowler & Bean, 1930). Then we have the species Belonoperca pylei Baldwin & Smith, 1998 as understood in Eschmeyer's 2004 checklist.

Notice that each usage of a taxon name gets linked back to a previous usage, and names are linked to higher names in a taxonomic hierarchy. When the species Belonoperca pylei was described it was placed in the genus Belonoperca, when Belonoperca was described it was placed in the family Serranidae, and so on.

Fuzzy matching taxonomic names using ngrams

Quick note to self about possible way to using fuzzy matching when searching for taxonomic names. Now that I'm using Cloudant to host CouchDB databases (e.g., see BioStor in the the cloud) I'd like to have a way to support fuzzy matching so that if I type in a name and misspelt it, there's a reasonable chance I will still find that name. This is the "did you mean?" feature beloved by Google users. There are various ways to tackle this problem, and Tony Rees' TAXAMATCH is perhaps the best known solution.

Cloudant supports Lucence for full text searching, but while this allows some possibility for approximate matching (by appending "~" to the search string) initial experiments suggested it wasn't going to be terribly useful. What does seem to work is to use ngrams. As a crude example, here is a CouchDN view that converts a string (in this case a taxon name) to a series of trigrams (three letter strings) then indexes their concatenation.


{
"_id": "_design/taxonname",
"language": "javascript",
"indexes": {
"all": {
"index": "function(doc) { if (doc.docType == 'taxonName') { var n = doc.nameComplete.length; var ngrams = []; for (var i=0; i < n-2;i++) { var ngram = doc.nameComplete.charAt(i) + doc.nameComplete.charAt(i+1) + doc.nameComplete.charAt(i+2); ngrams.push(ngram); } if (n > 2) { ngrams.push('$' + doc.nameComplete.charAt(0) + doc.nameComplete.charAt(1)); ngrams.push(doc.nameComplete.charAt(n-2) + doc.nameComplete.charAt(n-1) + '$'); } ngrams.sort(); index(\"default\", ngrams.join(' '), {\"store\": \"yes\"}); } }"
}
}
}

To search this view for a name I then generate trigrams for the query string (e.g., "Pomatomix" becomes "$Po Pom oma mat ato tom omi mix ix$" where "$" signals the start or end of the string) and search on that. For example, append this string to the URL of the CouchDB database to search for "Pomatomix":


_design/taxonname/_search/all?q=$Po%20Pom%20oma%20mat%20ato%20tom%20omi%20mix%20ix$&include_docs=true&limit=10


Initial results are promising (searching on bigrams generated an alarming degree of matches that seemed rather dubious). I need to do some more work on this, but it might be a simple and quick way to support "did you mean?" for taxonomic names.