Alignment-free Sequence Searching over Whole Genomes Using 3D Random Plot of Query DNA Sequences

Da-Young Lee, Hae-Sung Tak, Han-Ho Kim, Hwan-Gue Cho

Abstract


Most genomic data studies are based on sequence comparisons and searches, and comparison models based on alignment algorithms are most commonly used. This method is very accurate, but it is useful when the query is short in kilobytes, because it requires the quadratic time and space complexity, O(n2) where n is the length of target and query sequences. With the development of Next Generation Sequencing techniques, researches on whole genome sequence data of megabyte size are being actively studied, and new comparison and search methods for large-scale sequence data are needed. We propose a new alignment-free sequence comparison and search method to overcome the limitations of the alignment-based model. In this graphical model, the sequence searching problem in DNA strings can be reduced to find some parts of geometric object within a relatively small-scale geometric space. When comparing similarity by modifying sequences of similar length, we can confirm that the comparison model is appropriate by accurately reflecting the degree of similarity. When searching the query sequence comparison model based on 200MB sized whole genome sequence, using the compressed coordinate information, it was able to search the 10MB sequences in 22s, which is a very reduced time compared to alignment. Although it is not possible to find the exact position of the base pair unit as in the alignment result, it is a model that can be used as a preprocessing process to quickly search a whole genome sequence of several hundred megabytes-size.

Full Text:

PDF

References


% Genome Sequence Visualization

@article{altschul1990basic,

title={Basic local alignment search tool},

author={Altschul, Stephen F and Gish, Warren and Miller, Webb and Myers, Eugene W and Lipman, David J},

journal={Journal of molecular biology},

volume={215},

number={3},

pages={403--410},

year={1990},

publisher={Elsevier}

}

@article{liao20063d,

title={A 3D graphical representation of DNA sequences and its application},

author={Liao, Bo and Ding, Kequan},

journal={Theoretical Computer Science},

volume={358},

number={1},

pages={56--64},

year={2006},

publisher={Elsevier}

}

@article{nakamura2013shape,

title={A shape-based similarity measure for time series data with ensemble learning},

author={Nakamura, Tetsuya and Taki, Keishi and Nomiya, Hiroki and Seki, Kazuhiro and Uehara, Kuniaki},

journal={Pattern Analysis and Applications},

volume={16},

number={4},

pages={535--548},

year={2013},

publisher={Springer}

}

@article{randic2003compact,

title={Compact 2-D graphical representation of DNA},

author={Randi{'c}, Milan and Vra{v{c}}ko, Marjan and Zupan, Jure and Novi{v{c}}, Marjana},

journal={Chemical physics letters},

volume={373},

number={5},

pages={558--562},

year={2003},

publisher={Elsevier}

}

@article{randic2004graphical,

title={Graphical representations of DNA as 2-D map},

author={Randi{'c}, Milan},

journal={Chemical Physics Letters},

volume={386},

number={4},

pages={468--471},

year={2004},

publisher={Elsevier}

}

@article{wu2003db,

title={DB-Curve: a novel 2D method of DNA sequence visualization and representation},

author={Wu, Yonghui and Liew, Alan Wee-Chung and Yan, Hong and Yang, Mengsu},

journal={Chemical Physics Letters},

volume={367},

number={1},

pages={170--176},

year={2003},

publisher={Elsevier}

}

@article{li2009hl,

title={H-L curve: a novel 2D graphical representation of protein sequences},

author={Li, Yongfan and Huang, Guohua and Liao, Bo and Liu, Zanbo},

journal={MATCH-COMMUNICATIONS IN MATHEMATICAL AND IN COMPUTER CHEMISTRY},

volume={61},

number={2},

pages={519--532},

year={2009},

publisher={UNIV KRAGUJEVAC, FAC SCIENCE PO BOX 60, KRAGUJEVAC 34000, SERBIA}

}

@article{randic2005four,

title={Four-color map representation of DNA or RNA sequences and their numerical characterization},

author={Randi{'c}, Milan and Ler{v{s}}, Nella and Plav{v{s}}i{'c}, Dejan and Basak, Subhash C and Balaban, Alexandru T},

journal={Chemical physics letters},

volume={407},

number={1},

pages={205--208},

year={2005},

publisher={Elsevier}

}

@article{krzywinski2009circos,

title={Circos: an information aesthetic for comparative genomics},

author={Krzywinski, Martin and Schein, Jacqueline and Birol, Inanc and Connors, Joseph and Gascoyne, Randy and Horsman, Doug and Jones, Steven J and Marra, Marco A},

journal={Genome research},

volume={19},

number={9},

pages={1639--1645},

year={2009},

publisher={Cold Spring Harbor Lab}

}

@article{an2015j,

title={J-Circos: an interactive Circos plotter},

author={An, Jiyuan and Lai, John and Sajjanhar, Atul and Batra, Jyotsna and Wang, Chenwei and Nelson, Colleen C},

journal={Bioinformatics},

volume={31},

number={9},

pages={1463--1465},

year={2015},

publisher={Oxford University Press}

}

% Genome Sequence Visualization based analysis

@article{randic2003analysis,

title={Analysis of similarity/dissimilarity of DNA sequences based on novel 2-D graphical representation},

author={Randi{'c}, Milan and Vra{v{c}}ko, Marjan and Ler{v{s}}, Nella and Plav{v{s}}i{'c}, Dejan},

journal={Chemical Physics Letters},

volume={371},

number={1},

pages={202--207},

year={2003},

publisher={Elsevier}

}

@article{randic2006novel,

title={A novel unexpected use of a graphical representation of DNA: Graphical alignment of DNA sequences},

author={Randi{'c}, Milan and Zupan, Jure and Viki{'c}-Topi{'c}, Dra{v{z}}en and Plav{v{s}}i{'c}, Dejan},

journal={Chemical Physics Letters},

volume={431},

number={4},

pages={375--379},

year={2006},

publisher={Elsevier}

}

@article{huang2009similarity,

title={Similarity studies of DNA sequences based on a new 2D graphical representation},

author={Huang, Guohua and Liao, Bo and Li, Yongfan and Yu, Yougui},

journal={Biophysical chemistry},

volume={143},

number={1},

pages={55--59},

year={2009},

publisher={Elsevier}

}

@article{kim1997visualization,

title={A visualization technique for DNA walk plot using k-convex hull},

author={Kim, Min-Ah and Lee, Eun-Jeong and Cho, Hwan-Gue and Park, Kie-Jung},

year={1997},

volume={5},

number={1-3},

pages={212-221},

journal={Journal of WSCG},

publisher={V{'a}clav Skala-UNION Agency}

}

@article{da2016comparison,

title={Comparison-specialized visualization model for whole genome sequences},

author={Da-Young, Lee and Kyung-Rim, Kim and Taeyong, Kim and Hwan-Gue, Cho},

year={2016},

volume={24},

number={2},

pages={43-52},

journal={Journal of WSCG},

publisher={V{'a}clav Skala-UNION Agency}

}

@inproceedings{2016webgl,

author={Dayoung Lee,Daegeon Kwon,Hwan-gue Cho},

title={{Web-GL based Visualization System for Whole Genomes}},

booktitle={Proceedings of KIISE},

publisher={KOREA INFORMATION SCIENCE SOCIETY},

year={2016},

pages={1414-1416},

}

}

@MastersThesis{2015opengl,

title={Whole Genome Data Visualization and Analysis Using 3D Random Walk Plot},

author={Daegeon Kwon},

year={2015},

school={Pusan National University}

}

@article{pasechnik2005dynamical,

title={Dynamical visualization of the DNA sequence and its nucleotide content},

author={Pasechnik, Alexey and Myll{"a}ri, Aleksandr and Salakoski, Tapio and Myll{"a}ri, A and Salakoski, T and Salakoski, T},

journal={Proceedings of KRBIO},

volume={5},

pages={47--50},

year={2005}

}

@inproceedings{sandes2010cudalign,

title={CUDAlign: using GPU to accelerate the comparison of megabase genomic sequences},

author={Sandes, Edans Flavius O and de Melo, Alba Cristina},

booktitle={ACM Sigplan Notices},

volume={45},

number={5},

pages={137--146},

year={2010},

organization={ACM}

}

@article{vinga2003alignment,

title={Alignment-free sequence comparison—a review},

author={Vinga, Susana and Almeida, Jonas},

journal={Bioinformatics},

volume={19},

number={4},

pages={513--523},

year={2003},

publisher={Oxford University Press}

}




DOI: https://doi.org/10.31449/inf.v42i3.2276

Creative Commons License
This work is licensed under a Creative Commons Attribution 3.0 License.