diff --git a/src/bwa/COPYING b/src/bwa/COPYING new file mode 100644 index 000000000..94a9ed024 --- /dev/null +++ b/src/bwa/COPYING @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. 
And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. 
+ + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. 
+ + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. 
Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. 
+ + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. 
This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. 
+ + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. 
If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. 
+ + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. 
For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<http://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/src/bwa/ChangeLog b/src/bwa/ChangeLog new file mode 100644 index 000000000..403e61fa6 --- /dev/null +++ b/src/bwa/ChangeLog @@ -0,0 +1,3864 @@ +------------------------------------------------------------------------ +r1605 | lh3 | 2010-12-29 20:20:20 -0500 (Wed, 29 Dec 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.5.9rc1-2 (r1605) + * fixed a typo/bug in bwasw + +------------------------------------------------------------------------ +r1587 | lh3 | 2010-12-21 18:48:30 -0500 (Tue, 21 Dec 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +a typo in the manual + +------------------------------------------------------------------------ +r1586 | lh3 | 2010-12-21 18:47:48 -0500 (Tue, 21 Dec 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/utils.c + M /branches/prog/bwa/utils.h + + * bwa-0.5.9rc1-1 (r1586) + * a few patches by John + +------------------------------------------------------------------------ +r1562 | lh3 | 2010-12-10 01:02:06 -0500 (Fri, 10 Dec 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + +documentation on specifying @RG + +------------------------------------------------------------------------ +r1561 | lh3 | 2010-12-10 00:45:40 -0500 (Fri, 10 Dec 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.5.9rc1 (r1561) + +------------------------------------------------------------------------ +r1560 | lh3 | 2010-12-10 00:29:08 -0500 (Fri, 10 Dec 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/main.c + + * fixed a small memory leak caused by the BAM reader + * fixed a memory 
violation, also in the BAM reader + +------------------------------------------------------------------------ +r1559 | lh3 | 2010-12-10 00:10:48 -0500 (Fri, 10 Dec 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/Makefile + +change Makefile gcc options + +------------------------------------------------------------------------ +r1558 | lh3 | 2010-12-10 00:09:22 -0500 (Fri, 10 Dec 2010) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.5.8-6 (r1557) + * added a little more comments to BWA-SW + * randomly choosing a mapping if there are more than one + +------------------------------------------------------------------------ +r1557 | lh3 | 2010-12-09 21:58:00 -0500 (Thu, 09 Dec 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2_aux.c + +sometimes unmapped reads may not be printed... + +------------------------------------------------------------------------ +r1556 | lh3 | 2010-12-09 21:50:26 -0500 (Thu, 09 Dec 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2_aux.c + +print unmapped reads + +------------------------------------------------------------------------ +r1555 | lh3 | 2010-12-09 21:17:20 -0500 (Thu, 09 Dec 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.5.8-5 (r1555) + * BAM input documentation + +------------------------------------------------------------------------ +r1544 | lh3 | 2010-11-23 11:01:41 -0500 (Tue, 23 Nov 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.5.8-4 (r1544) + * supporting adding RG tags and RG lines + +------------------------------------------------------------------------ +r1543 | lh3 | 2010-11-23 00:16:40 -0500 
(Tue, 23 Nov 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.5.8-3 (r1543) + * fixed a memory leak + +------------------------------------------------------------------------ +r1542 | lh3 | 2010-11-22 23:50:56 -0500 (Mon, 22 Nov 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.5.8-2 (r1542) + * fixed a long existing bug in random placement of reads + +------------------------------------------------------------------------ +r1541 | lh3 | 2010-11-22 23:27:29 -0500 (Mon, 22 Nov 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + A /branches/prog/bwa/bamlite.c + A /branches/prog/bwa/bamlite.h + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + +preliminary BAM input support + +------------------------------------------------------------------------ +r1537 | lh3 | 2010-10-16 23:46:20 -0400 (Sat, 16 Oct 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwa.1 + +change version number and ChangeLog + +------------------------------------------------------------------------ +r1536 | lh3 | 2010-10-16 23:35:10 -0400 (Sat, 16 Oct 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + + * fixed a bug in the scoring matrix + * release bwa-0.5.8c (r1536) + +------------------------------------------------------------------------ +r1451 | lh3 | 2010-06-15 09:43:52 -0400 (Tue, 15 Jun 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +version change + +------------------------------------------------------------------------ +r1450 | lh3 | 2010-06-15 09:42:21 -0400 (Tue, 15 Jun 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + + * bwa-0.5.8b (r1450) + * 
fixed a bug in scoring matrix + +------------------------------------------------------------------------ +r1445 | lh3 | 2010-06-11 08:58:33 -0400 (Fri, 11 Jun 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + +fixed a serious bug + +------------------------------------------------------------------------ +r1442 | lh3 | 2010-06-08 10:22:14 -0400 (Tue, 08 Jun 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/main.c + +Release bwa-0.5.8 (r1442) + +------------------------------------------------------------------------ +r1440 | lh3 | 2010-05-19 13:43:50 -0400 (Wed, 19 May 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-r1440 + * sorry, forget to remove a debugging line + +------------------------------------------------------------------------ +r1439 | lh3 | 2010-05-19 13:43:08 -0400 (Wed, 19 May 2010) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-r1439 + * fixed a bug in bwasw caused by a recent modification + * throwing insane insert size when estimating isize + +------------------------------------------------------------------------ +r1425 | lh3 | 2010-04-29 15:15:23 -0400 (Thu, 29 Apr 2010) | 10 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.5.7-7 (r1425) + * fixed a minor bug in bwasw command-line parsing + * When band-width is not large enough, bwasw may find two highly + overlapping but not completely overlapping alignments. The old + version will filter out one of them, which leads to false + negatives. The current outputs both. This solution is obviously not + ideal. The ideal one would be to increase the band-width and redo the + alignment. 
+ + +------------------------------------------------------------------------ +r1399 | lh3 | 2010-04-16 09:20:49 -0400 (Fri, 16 Apr 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.5.7-6 (r1399) + * fixed a typo/bug (by Vaughn Iverson) + +------------------------------------------------------------------------ +r1329 | lh3 | 2010-03-19 23:32:46 -0400 (Fri, 19 Mar 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + +small correction + +------------------------------------------------------------------------ +r1328 | lh3 | 2010-03-19 23:28:44 -0400 (Fri, 19 Mar 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.5.7-4 (r1328) + * automatically adjust ap_prior based on alignment + +------------------------------------------------------------------------ +r1327 | lh3 | 2010-03-19 23:02:40 -0400 (Fri, 19 Mar 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.5.7-3 (r1327) + * evaluate hits obtained from SW alignment in a more proper way. + +------------------------------------------------------------------------ +r1320 | lh3 | 2010-03-17 15:13:22 -0400 (Wed, 17 Mar 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/bwape.c + +fixed a potential out-of-boundary error. Need more testing. 
+ +------------------------------------------------------------------------ +r1319 | lh3 | 2010-03-14 22:44:46 -0400 (Sun, 14 Mar 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/bwape.c + +insert size is `weird' if the 3rd quatile larger than 100,000bp + +------------------------------------------------------------------------ +r1318 | lh3 | 2010-03-14 22:37:35 -0400 (Sun, 14 Mar 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.5.7-2 (r1318) + * in sampe, allow to disable insert size estimate + +------------------------------------------------------------------------ +r1317 | lh3 | 2010-03-14 22:14:14 -0400 (Sun, 14 Mar 2010) | 5 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/solid2fastq.pl + + * bwa-0.5.7-1 (r1317) + * fixed a potential bug in solid2fastq.pl + * fixed a bug in calculating mapping quality (by Rodrigo Goya) + * fixed a very rare bug (if ever occur) about pairing + +------------------------------------------------------------------------ +r1310 | lh3 | 2010-03-01 10:35:45 -0500 (Mon, 01 Mar 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/main.c + +Release bwa-0.5.7 + +------------------------------------------------------------------------ +r1309 | lh3 | 2010-02-26 21:42:22 -0500 (Fri, 26 Feb 2010) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.5.6-2 (r1309) + * fixed an unfixed bug (by Carol Scott) + * fixed some tiny formatting + +------------------------------------------------------------------------ +r1305 | lh3 | 2010-02-25 13:47:58 -0500 (Thu, 25 Feb 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtsw2_main.c + 
M /branches/prog/bwa/main.c + + * bwa-0.5.6-1 (r1304) + * optionally write output to a file (by Tim Fennel) + +------------------------------------------------------------------------ +r1303 | lh3 | 2010-02-10 23:43:48 -0500 (Wed, 10 Feb 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + +Release bwa-0.5.6 + +------------------------------------------------------------------------ +r1302 | lh3 | 2010-02-10 11:11:49 -0500 (Wed, 10 Feb 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.5.5-10 (r1302) + * improve max insert size estimate (method suggested by Gerton Lunter) + +------------------------------------------------------------------------ +r1301 | lh3 | 2010-02-09 16:15:28 -0500 (Tue, 09 Feb 2010) | 5 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.5.5-9 (r1301) + * improve mapping quality calculation for abnomalous pairs + * fixed a bug in multiple hits + * SOLiD multiple hits should work now + +------------------------------------------------------------------------ +r1300 | lh3 | 2010-02-09 12:50:02 -0500 (Tue, 09 Feb 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.5.5-8 (r1300) + * output kurtosis + +------------------------------------------------------------------------ +r1299 | lh3 | 2010-02-09 12:33:34 -0500 (Tue, 09 Feb 2010) | 5 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.5.5-7 (r1299) + * calculate skewness in sampe + * increase min_len in SW to 20 + * perform more SW to fix discordant pairs + +------------------------------------------------------------------------ +r1298 | lh3 | 
2010-02-08 12:40:31 -0500 (Mon, 08 Feb 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/cs2nt.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.5.5-6 (r1297) + * prepare to replace all 16-bit CIGAR (patches by Rodrigo Goya) + +------------------------------------------------------------------------ +r1297 | lh3 | 2010-02-05 22:26:11 -0500 (Fri, 05 Feb 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/solid2fastq.pl + +the old fix seems not working! + +------------------------------------------------------------------------ +r1296 | lh3 | 2010-02-05 21:51:03 -0500 (Fri, 05 Feb 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.5.5-5 (r1296) + * fixed a minor issue that the lower bound of insert size is not correctly set. + +------------------------------------------------------------------------ +r1295 | lh3 | 2010-02-05 21:01:10 -0500 (Fri, 05 Feb 2010) | 5 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/main.c + + * bwa-0.5.5-4 (r1295) + * fixed a memory leak + * change the behaviour of -n (samse and sampe) + * change the default of -n + +------------------------------------------------------------------------ +r1294 | lh3 | 2010-02-05 17:24:06 -0500 (Fri, 05 Feb 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.5.5-3 (r1294) + * improved multi-hit report + +------------------------------------------------------------------------ +r1293 | lh3 | 2010-02-05 12:57:38 -0500 (Fri, 05 Feb 2010) | 5 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M 
/branches/prog/bwa/cs2nt.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/solid2fastq.pl + + * bwa-0.5.5-2 (r1293) + * bugfix: truncated quality string + * bugfix: quality -1 in solid->fastq conversion + * bugfix: color reads on the reverse strand is not complemented + +------------------------------------------------------------------------ +r1279 | lh3 | 2009-11-23 22:42:34 -0500 (Mon, 23 Nov 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bntseq.h + M /branches/prog/bwa/bwase.c + A /branches/prog/bwa/bwase.h + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.5.5-1 (r1279) + * incorporate changes from Matt Hanna for Java bindings. + +------------------------------------------------------------------------ +r1275 | lh3 | 2009-11-10 22:13:10 -0500 (Tue, 10 Nov 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + +update ChangeLog + +------------------------------------------------------------------------ +r1273 | lh3 | 2009-11-10 22:08:16 -0500 (Tue, 10 Nov 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + A /branches/prog/bwa/qualfa2fq.pl + +Release bwa-0.5.5 (r1273) + +------------------------------------------------------------------------ +r1272 | lh3 | 2009-11-10 22:02:50 -0500 (Tue, 10 Nov 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.5.4-3 (r1272) + * fixed another typo which may lead to incorrect single-end mapping quality + +------------------------------------------------------------------------ +r1271 | lh3 | 2009-11-10 21:59:47 -0500 (Tue, 10 Nov 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.5.4-2 (r1271) + * fixed a serious typo/bug which does not hurt if we allow one gap open + and work with <200bp reads, but causes 
segfault for long reads. + +------------------------------------------------------------------------ +r1270 | lh3 | 2009-11-09 23:12:42 -0500 (Mon, 09 Nov 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/cs2nt.c + M /branches/prog/bwa/main.c + + * bwa-0.5.4-1 (r1270) + * fixed a bug in color alignment + +------------------------------------------------------------------------ +r1245 | lh3 | 2009-10-09 07:42:52 -0400 (Fri, 09 Oct 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/main.c + +Release bwa-0.5.4 + +------------------------------------------------------------------------ +r1244 | lh3 | 2009-10-09 05:53:52 -0400 (Fri, 09 Oct 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + + * bwa-0.5.3-4 (r1244) + * output the clipped length in XC:i: tag + * skip mate alignment when stdaln is buggy + * fixed a bug in NM:i: tag + +------------------------------------------------------------------------ +r1243 | lh3 | 2009-10-07 08:15:04 -0400 (Wed, 07 Oct 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.5.3-3 (r1243) + * sampe: fixed a bug when a read sequence is identical its reverse complement. 
+ +------------------------------------------------------------------------ +r1242 | lh3 | 2009-10-07 07:49:13 -0400 (Wed, 07 Oct 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.5.3-2 (r1242) + * sampe: optionall preload the full index into memory + * aln: change the default seed length to 32bp + +------------------------------------------------------------------------ +r1238 | lh3 | 2009-09-26 18:38:15 -0400 (Sat, 26 Sep 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/khash.h + +Improve portability of khash.h + +------------------------------------------------------------------------ +r1228 | lh3 | 2009-09-15 09:20:22 -0400 (Tue, 15 Sep 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/main.c + +fixed a typo + +------------------------------------------------------------------------ +r1227 | lh3 | 2009-09-15 09:19:35 -0400 (Tue, 15 Sep 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.5.3-1 (r1226) + * in dBWT-SW, optionall use hard clipping instead of soft clipping + +------------------------------------------------------------------------ +r1225 | lh3 | 2009-09-15 08:32:30 -0400 (Tue, 15 Sep 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + +Release bwa-0.5.3 (r1225) + +------------------------------------------------------------------------ +r1223 | lh3 | 2009-09-13 07:30:41 -0400 (Sun, 13 Sep 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.5.2 + +------------------------------------------------------------------------ +r1222 | lh3 | 2009-09-11 09:11:39 -0400 (Fri, 11 Sep 2009) | 3 
lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.5.1-5 (r1222) + * fixed a typo. No real change + +------------------------------------------------------------------------ +r1221 | lh3 | 2009-09-11 09:09:44 -0400 (Fri, 11 Sep 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.5.1-4 (r1221) + * trim reads before alignment + +------------------------------------------------------------------------ +r1216 | lh3 | 2009-09-08 17:50:15 -0400 (Tue, 08 Sep 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.5.1-3 (r1216) + * fixed a bug about NM tags for gapped alignment + * print SAM header + +------------------------------------------------------------------------ +r1215 | lh3 | 2009-09-08 17:14:42 -0400 (Tue, 08 Sep 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.5.1-2 (r1215) + * fixed a bug when read lengths vary (by John Marshall) + +------------------------------------------------------------------------ +r1213 | lh3 | 2009-09-06 18:58:15 -0400 (Sun, 06 Sep 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.5.1-1 (r1213) + * change default -T to 30 + +------------------------------------------------------------------------ +r1209 | lh3 | 2009-09-02 06:06:02 -0400 (Wed, 02 Sep 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.5.1 + +------------------------------------------------------------------------ +r1208 | lh3 | 2009-09-02 05:56:33 -0400 (Wed, 02 Sep 2009) | 2 lines +Changed paths: + M 
/branches/prog/bwa/ChangeLog + + * ChangeLog + +------------------------------------------------------------------------ +r1206 | lh3 | 2009-08-30 18:27:30 -0400 (Sun, 30 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.5.0-6 (r1206) + * fixed two bugs caused by previous modification + +------------------------------------------------------------------------ +r1205 | lh3 | 2009-08-30 17:28:36 -0400 (Sun, 30 Aug 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.5.0-4 (r1205) + * reduce false coordinates and CIGAR when a query bridges two reference + sequences, although some very rare cases may fail bwa. + +------------------------------------------------------------------------ +r1204 | lh3 | 2009-08-30 06:06:16 -0400 (Sun, 30 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.5.0-3 (r1204) + * choose one repetitive hit to extend + +------------------------------------------------------------------------ +r1203 | lh3 | 2009-08-29 18:11:51 -0400 (Sat, 29 Aug 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.5.0-2 (r1203) + * dBWT-SW: change a parameter in calculating mapping quality + * fixed a bug in samse + +------------------------------------------------------------------------ +r1202 | lh3 | 2009-08-28 19:48:41 -0400 (Fri, 28 Aug 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.5.0-1 (r1202) + * change default band width to 50 + * improve mapping quality a bit + +------------------------------------------------------------------------ +r1200 | lh3 | 2009-08-20 06:21:24 -0400 (Thu, 20 Aug 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M 
/branches/prog/bwa/main.c + +Release bwa-0.5.0 (r1200) + +------------------------------------------------------------------------ +r1199 | lh3 | 2009-08-20 04:49:15 -0400 (Thu, 20 Aug 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwa.1 + +Updated ChangeLog and the manual + +------------------------------------------------------------------------ +r1198 | lh3 | 2009-08-19 11:09:15 -0400 (Wed, 19 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-36 (r1198) + * simplify duphits removal. The accuracy is changed a tiny bit, sometimes better, sometimes worse. + +------------------------------------------------------------------------ +r1197 | lh3 | 2009-08-19 08:15:05 -0400 (Wed, 19 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2_aux.c + A /branches/prog/bwa/bwtsw2_chain.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-35 (r1197) + * further heuristic acceleration for long queries + +------------------------------------------------------------------------ +r1196 | lh3 | 2009-08-18 06:54:03 -0400 (Tue, 18 Aug 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-34 (r1196) + * updated the manual page + * output base quality if the input is fastq + +------------------------------------------------------------------------ +r1195 | lh3 | 2009-08-18 06:23:00 -0400 (Tue, 18 Aug 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/simple_dp.c + + * bwa-0.4.9-33 (r1191) + * fixed a bug in sampe/samse when gaps occur to the 5'-end in SW alignment + * in dbwtsw adjust -T and -c according to -a + +------------------------------------------------------------------------ +r1192 | lh3 | 2009-08-13 05:37:28 
-0400 (Thu, 13 Aug 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update manual + +------------------------------------------------------------------------ +r1191 | lh3 | 2009-08-12 19:40:51 -0400 (Wed, 12 Aug 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwtsw2_main.c + +update documentation + +------------------------------------------------------------------------ +r1190 | lh3 | 2009-08-12 08:56:10 -0400 (Wed, 12 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-32 (r1190) + * only help messages are changed + +------------------------------------------------------------------------ +r1189 | lh3 | 2009-08-11 09:28:55 -0400 (Tue, 11 Aug 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-31 (r1189) + * in bwape/bwase, print CIGAR "*" if the read is unmapped + * improved the calculation of mapping quality + +------------------------------------------------------------------------ +r1181 | lh3 | 2009-08-03 12:09:41 -0400 (Mon, 03 Aug 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + +fflush() + +------------------------------------------------------------------------ +r1180 | lh3 | 2009-08-03 12:08:46 -0400 (Mon, 03 Aug 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-30 (r1180) + * fixed a memory problem + * multi-threading sometimes does not work... 
+ +------------------------------------------------------------------------ +r1179 | lh3 | 2009-08-03 11:04:39 -0400 (Mon, 03 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-29 (r1179) + * preliminary mutli-threading support in dbwtsw + +------------------------------------------------------------------------ +r1178 | lh3 | 2009-08-03 09:14:54 -0400 (Mon, 03 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-28 (r1178) + * fixed a bug in printing repetitive hits + +------------------------------------------------------------------------ +r1177 | lh3 | 2009-08-03 05:03:42 -0400 (Mon, 03 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-27 (r1177) + * bwtsw2: fixed a hidden memory leak + +------------------------------------------------------------------------ +r1176 | lh3 | 2009-07-31 10:58:24 -0400 (Fri, 31 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-26 + * change the way mapping quality is calculated + +------------------------------------------------------------------------ +r1175 | lh3 | 2009-07-31 09:15:54 -0400 (Fri, 31 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-25 + * code clean up + * automatically adjust ->t and ->is_rev based on input + +------------------------------------------------------------------------ +r1174 | lh3 | 2009-07-30 08:50:25 -0400 (Thu, 30 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M 
/branches/prog/bwa/main.c + + * bwa-0.4.9-24 + * fixed a bug in printing the hits + +------------------------------------------------------------------------ +r1173 | lh3 | 2009-07-29 18:32:43 -0400 (Wed, 29 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-23 + * allow to skip reverse alignment + * increase opt->t to 37 + +------------------------------------------------------------------------ +r1172 | lh3 | 2009-07-29 17:22:39 -0400 (Wed, 29 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-22 + * report if the hit is found in both directions + +------------------------------------------------------------------------ +r1171 | lh3 | 2009-07-29 17:12:02 -0400 (Wed, 29 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-21 + * dbwtsw: map to both forward and reverse BWT to reduce false alignment + +------------------------------------------------------------------------ +r1170 | lh3 | 2009-07-29 15:25:14 -0400 (Wed, 29 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + +save hits before cut_tail() + +------------------------------------------------------------------------ +r1169 | lh3 | 2009-07-29 08:06:01 -0400 (Wed, 29 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.4.9-19 + * use a global memory pool to reduce the CPU time spent on malloc/free(). 
+ +------------------------------------------------------------------------ +r1168 | lh3 | 2009-07-29 06:13:29 -0400 (Wed, 29 Jul 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-18 + * reduce unnecessary extension to the 5'-end + * allow to use different interval size for the 2 rounds + * change default parameters + +------------------------------------------------------------------------ +r1167 | lh3 | 2009-07-28 19:06:17 -0400 (Tue, 28 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-17 + * dbwtsw: fixed THE memory leak. + +------------------------------------------------------------------------ +r1166 | lh3 | 2009-07-28 16:31:41 -0400 (Tue, 28 Jul 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + + * bwa-0.4.9-16 + * fixed a memory leak + * a small memory leak still occurs to bwtsw2_core(). I will work on that later. + * changed the default parameters + +------------------------------------------------------------------------ +r1165 | lh3 | 2009-07-28 10:15:40 -0400 (Tue, 28 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + + * bwa-0.4.9-15 + * generate CIGAR right before output. This saves unnecessary computation. + * this version may be buggy as I have not tested it. 
+ +------------------------------------------------------------------------ +r1164 | lh3 | 2009-07-28 09:04:14 -0400 (Tue, 28 Jul 2009) | 11 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.4.9-14 + + * deplete unique hits in dbwtsw and postprocess them with standard sw + + * in principle, this strategy should be faster and more accurate, but I + have not tested this point. I may switch back to the old method if + this does not work. + + * the code looks quite nasty now. it needs clean up... + + +------------------------------------------------------------------------ +r1163 | lh3 | 2009-07-27 17:41:10 -0400 (Mon, 27 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + +change a default parameter + +------------------------------------------------------------------------ +r1162 | lh3 | 2009-07-27 17:04:35 -0400 (Mon, 27 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-13 + * dbwtsw: switch between small and large Z-best + +------------------------------------------------------------------------ +r1161 | lh3 | 2009-07-27 12:17:41 -0400 (Mon, 27 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-12 + * changed the default -z to 100 + * heuristically speed up alignments for polyA reads + +------------------------------------------------------------------------ +r1160 | lh3 | 2009-07-27 07:50:57 -0400 (Mon, 27 Jul 2009) | 6 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * 
bwa-0.4.9-11 + + * dbwtsw potentially generates less false alignments, although in + practice, the modification brings no improvement. + + +------------------------------------------------------------------------ +r1159 | lh3 | 2009-07-27 04:37:02 -0400 (Mon, 27 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-10 + * disabled debugging code + * add "BAM_FMU" if both ends are unmapped + +------------------------------------------------------------------------ +r1158 | lh3 | 2009-07-24 09:36:52 -0400 (Fri, 24 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/main.c + +nothing, really + +------------------------------------------------------------------------ +r1157 | lh3 | 2009-07-24 09:05:44 -0400 (Fri, 24 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-9 + * bwtsw2: generate SAM output + +------------------------------------------------------------------------ +r1156 | lh3 | 2009-07-24 05:42:47 -0400 (Fri, 24 Jul 2009) | 6 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-8 + + * fixed a weird deadloop which only happens to icc -O3. Thanks John + Marshall for the fix. + + +------------------------------------------------------------------------ +r1155 | lh3 | 2009-07-24 05:28:40 -0400 (Fri, 24 Jul 2009) | 8 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-7 + + * fixed a typo in bwtsw2 alignment. Now score from the standard SW + seems to agree with score from bwtsw2, except that in reporting + alignments, bwtsw2 may report non-optimal segments. This is expected, + though. I will improve in future. 
+ + +------------------------------------------------------------------------ +r1154 | lh3 | 2009-07-23 17:40:20 -0400 (Thu, 23 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * aln_left_core() seems to work properly + * aln_local_core() has a bug... AN EVER EXISTING BUG!!!!!!!!!!! + +------------------------------------------------------------------------ +r1153 | lh3 | 2009-07-23 17:06:09 -0400 (Thu, 23 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/stdaln.c + +removed debugging code... + +------------------------------------------------------------------------ +r1152 | lh3 | 2009-07-23 17:01:00 -0400 (Thu, 23 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/stdaln.c + + * radical changes failed... + * fixed a bug + +------------------------------------------------------------------------ +r1151 | lh3 | 2009-07-23 14:46:35 -0400 (Thu, 23 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/stdaln.c + +temporary changes. Will apply some radical changes to this file... + +------------------------------------------------------------------------ +r1150 | lh3 | 2009-07-23 10:09:56 -0400 (Thu, 23 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/stdaln.c + +fixed a long-existing bug in Smith-Waterman alignment + +------------------------------------------------------------------------ +r1149 | lh3 | 2009-07-23 08:50:52 -0400 (Thu, 23 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/simple_dp.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.4.9-6 + * unexplained inconsistency still occurs, but the results largely look reasonable. 
+ +------------------------------------------------------------------------ +r1148 | lh3 | 2009-07-23 08:07:29 -0400 (Thu, 23 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/stdaln.c + +half DP + +------------------------------------------------------------------------ +r1147 | lh3 | 2009-07-22 08:03:06 -0400 (Wed, 22 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + +a bit code clean up + +------------------------------------------------------------------------ +r1145 | lh3 | 2009-07-21 15:52:05 -0400 (Tue, 21 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-5 + * fixed a bug in determining sub-optimal hits + * removed some debugging codes + +------------------------------------------------------------------------ +r1144 | lh3 | 2009-07-21 10:17:29 -0400 (Tue, 21 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-4 + * better cmd interface + * faster speed + +------------------------------------------------------------------------ +r1143 | lh3 | 2009-07-20 16:38:18 -0400 (Mon, 20 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + +bwtsw2 (dBWT-SW) is working apparently... 
+ + +------------------------------------------------------------------------ +r1139 | lh3 | 2009-07-15 05:52:18 -0400 (Wed, 15 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-2 + * bwtsw2: change cut_tail() such that it is faster but more likely to + miss true hits + +------------------------------------------------------------------------ +r1138 | lh3 | 2009-07-15 05:18:42 -0400 (Wed, 15 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + A /branches/prog/bwa/bwt_lite.c + A /branches/prog/bwa/bwt_lite.h + A /branches/prog/bwa/bwtsw2.h + A /branches/prog/bwa/bwtsw2_aux.c + A /branches/prog/bwa/bwtsw2_core.c + A /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * bwa-0.4.9-1 + * added back bwtsw2 + +------------------------------------------------------------------------ +r1075 | lh3 | 2009-05-19 05:14:50 -0400 (Tue, 19 May 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + +Release bwa-0.4.9 + +------------------------------------------------------------------------ +r1073 | lh3 | 2009-05-18 17:13:19 -0400 (Mon, 18 May 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.4.8 + +------------------------------------------------------------------------ +r1069 | lh3 | 2009-05-14 09:54:54 -0400 (Thu, 14 May 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.4.7-2 + * change the default of "aln -R" to 30 + +------------------------------------------------------------------------ +r1068 | lh3 | 2009-05-14 09:27:55 -0400 (Thu, 14 May 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M 
/branches/prog/bwa/main.c + + * bwa-0.4.7-1 + * search for suboptimal hits if the top hit is not so repetitive + +------------------------------------------------------------------------ +r1066 | lh3 | 2009-05-12 15:31:31 -0400 (Tue, 12 May 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + +Release bwa-0.4.7 + +------------------------------------------------------------------------ +r1065 | lh3 | 2009-05-12 15:20:40 -0400 (Tue, 12 May 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.6-9 + * fixed compiling errors on some Linux machines + +------------------------------------------------------------------------ +r1064 | lh3 | 2009-05-12 07:30:46 -0400 (Tue, 12 May 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.4.6-8 + * avoid compilation error on some systems. 
+ +------------------------------------------------------------------------ +r1035 | lh3 | 2009-05-09 05:41:33 -0400 (Sat, 09 May 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.6-7 + * fixed an integer overflow caused by previous modifications + * made insert size estimation more robust + +------------------------------------------------------------------------ +r1008 | lh3 | 2009-04-29 05:41:58 -0400 (Wed, 29 Apr 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.6-5 + * fixed an integer overflow problem which may cause seg fault in very rare cases + * made XN tags more accurate + +------------------------------------------------------------------------ +r1005 | lh3 | 2009-04-27 07:37:23 -0400 (Mon, 27 Apr 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/simple_dp.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.4.6-4 + * heuristic rules to detect suboptimal alignment + * stdsw: support double-strand and protein alignment + +------------------------------------------------------------------------ +r1003 | lh3 | 2009-04-26 12:48:19 -0400 (Sun, 26 Apr 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/main.c + M /branches/prog/bwa/simple_dp.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.4.6-2 + * improve the functionality of stdsw + * allow to add a threshold on SW alignment. Hope this does not incur new bugs... 
+ +------------------------------------------------------------------------ +r1002 | lh3 | 2009-04-22 03:56:15 -0400 (Wed, 22 Apr 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.4.6-1 + * output SM and AM tag + +------------------------------------------------------------------------ +r914 | lh3 | 2009-03-09 17:53:50 -0400 (Mon, 09 Mar 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/main.c + +Release bwa-0.4.6 + +------------------------------------------------------------------------ +r913 | lh3 | 2009-03-09 17:23:24 -0400 (Mon, 09 Mar 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + A /branches/prog/bwa/solid2fastq.pl + + * added notes to bwa + * added a script to convert SOLiD reads + * updated documentations + +------------------------------------------------------------------------ +r912 | lh3 | 2009-03-09 16:57:05 -0400 (Mon, 09 Mar 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/kstring.c + M /branches/prog/bwa/main.c + +fixed a bug in kstring + +------------------------------------------------------------------------ +r881 | lh3 | 2009-03-02 15:36:06 -0500 (Mon, 02 Mar 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtmisc.c + M /branches/prog/bwa/main.c + + * bwa-0.4.5-7 + * fixed a bug in pac2cspac + +------------------------------------------------------------------------ +r880 | lh3 | 2009-03-01 16:34:08 -0500 (Sun, 01 Mar 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + +disable debugging + +------------------------------------------------------------------------ +r879 | lh3 | 2009-03-01 16:28:04 -0500 (Sun, 01 Mar 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/cs2nt.c + M 
/branches/prog/bwa/main.c + + * bwa-0.4.5-6 + * fixed problems with coordinates for color gapped alignment + +------------------------------------------------------------------------ +r878 | lh3 | 2009-03-01 13:43:09 -0500 (Sun, 01 Mar 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/cs2nt.c + M /branches/prog/bwa/main.c + + * bwa-0.4.5-5 + * added support for gapped color alignment + +------------------------------------------------------------------------ +r877 | lh3 | 2009-03-01 10:27:52 -0500 (Sun, 01 Mar 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/cs2nt.c + M /branches/prog/bwa/main.c + + * convert cs read to nt read (for ungapped alignment only) + +------------------------------------------------------------------------ +r860 | lh3 | 2009-02-27 08:58:39 -0500 (Fri, 27 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwase.c + A /branches/prog/bwa/cs2nt.c + +prepare to implement cs->nt conversion (have not yet...) + +------------------------------------------------------------------------ +r859 | lh3 | 2009-02-27 07:00:03 -0500 (Fri, 27 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bntseq.h + M /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/bwtmisc.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * bwa-0.4.5-3 + * generate color index from nucleotide fasta reference + +------------------------------------------------------------------------ +r857 | lh3 | 2009-02-26 10:22:58 -0500 (Thu, 26 Feb 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.4.5-2 + * improved mapping quality a bit if one end falls in a tandem repeat + but the mate is unique. 
+ +------------------------------------------------------------------------ +r856 | lh3 | 2009-02-26 10:02:29 -0500 (Thu, 26 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.4.5-1 + * make bwa work for SOLiD reads + +------------------------------------------------------------------------ +r828 | lh3 | 2009-02-18 17:36:41 -0500 (Wed, 18 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.4.5 + +------------------------------------------------------------------------ +r827 | lh3 | 2009-02-18 16:48:48 -0500 (Wed, 18 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.4.4-6 + * fixed a bug in SW alignment when no residue matches + +------------------------------------------------------------------------ +r824 | lh3 | 2009-02-17 05:33:07 -0500 (Tue, 17 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.4.4-5 + * fixed that boundary bug + +------------------------------------------------------------------------ +r823 | lh3 | 2009-02-17 04:54:18 -0500 (Tue, 17 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwape.c + +just change some logging information + +------------------------------------------------------------------------ +r822 | lh3 | 2009-02-17 04:20:39 -0500 (Tue, 17 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update manual + +------------------------------------------------------------------------ +r821 | lh3 | 2009-02-17 04:11:14 -0500 (Tue, 17 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.4-4 + * fixed a bug on boundary check in pair_sw + 
+------------------------------------------------------------------------ +r820 | lh3 | 2009-02-16 17:43:37 -0500 (Mon, 16 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.4.4-3 + * allow to change mismatch penalty + +------------------------------------------------------------------------ +r819 | lh3 | 2009-02-16 17:40:28 -0500 (Mon, 16 Feb 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.4.4-2 + * remove timer + * allow to change default gapo and gape penalty at the command line + +------------------------------------------------------------------------ +r818 | lh3 | 2009-02-16 09:30:51 -0500 (Mon, 16 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update benchmark + +------------------------------------------------------------------------ +r817 | lh3 | 2009-02-16 08:44:40 -0500 (Mon, 16 Feb 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/kvec.h + M /branches/prog/bwa/main.c + + * bwa-0.4.4-1 + * automatically detect insert size + * use insert size in pairing. This may potentially improve accuracy (untested!) 
+ +------------------------------------------------------------------------ +r814 | lh3 | 2009-02-15 11:10:23 -0500 (Sun, 15 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/main.c + +Release bwa-0.4.4 + +------------------------------------------------------------------------ +r813 | lh3 | 2009-02-15 10:22:50 -0500 (Sun, 15 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.3-5 + * impose boundary check in refine_gapped + +------------------------------------------------------------------------ +r811 | lh3 | 2009-02-14 09:46:13 -0500 (Sat, 14 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.3-4 + * change MD tag to match the latest SAM specification + +------------------------------------------------------------------------ +r810 | lh3 | 2009-02-13 04:46:04 -0500 (Fri, 13 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + +update ChangeLog + +------------------------------------------------------------------------ +r799 | lh3 | 2009-02-05 12:01:17 -0500 (Thu, 05 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + +change MD tag to meet the latest SAM specification + +------------------------------------------------------------------------ +r796 | lh3 | 2009-02-05 08:35:13 -0500 (Thu, 05 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.3-2 + * fixed a bug on counting 'N' + +------------------------------------------------------------------------ +r795 | lh3 | 2009-02-05 07:41:27 -0500 (Thu, 05 Feb 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.4.3-1 + * fixed potential boundary problems + * update 
benchmark result + +------------------------------------------------------------------------ +r791 | lh3 | 2009-01-25 05:20:47 -0500 (Sun, 25 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update some numbers + +------------------------------------------------------------------------ +r790 | lh3 | 2009-01-24 15:13:03 -0500 (Sat, 24 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update benchmark + +------------------------------------------------------------------------ +r789 | lh3 | 2009-01-22 10:18:44 -0500 (Thu, 22 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtindex.c + +a warning message for index + +------------------------------------------------------------------------ +r788 | lh3 | 2009-01-22 09:54:06 -0500 (Thu, 22 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/main.c + +forget to change release number + +------------------------------------------------------------------------ +r786 | lh3 | 2009-01-22 06:27:39 -0500 (Thu, 22 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + +Release bwa-0.4.3 + +------------------------------------------------------------------------ +r785 | lh3 | 2009-01-22 06:27:16 -0500 (Thu, 22 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + +Release bwa-0.4.3 + +------------------------------------------------------------------------ +r784 | lh3 | 2009-01-22 06:19:59 -0500 (Thu, 22 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.2-10 + * update documentation + * fixed a bug on generating MD tags for SW alignment + +------------------------------------------------------------------------ +r782 | lh3 | 2009-01-19 12:08:38 -0500 (Mon, 19 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.2-9 + * fixed a bug in samse -n... 
+ +------------------------------------------------------------------------ +r781 | lh3 | 2009-01-19 11:26:37 -0500 (Mon, 19 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.4.2-8 + * given -N, the previous version would stop if the top hit is a repeat. Now changed. + +------------------------------------------------------------------------ +r780 | lh3 | 2009-01-19 11:20:18 -0500 (Mon, 19 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.4.2-7 + * use a bit-wise flag to replace some member variables in the option struct + * allow to switch off the iterative strategy + +------------------------------------------------------------------------ +r779 | lh3 | 2009-01-19 10:45:57 -0500 (Mon, 19 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.2-6 + * allow to dump multiple hits from samse, in another format, though + +------------------------------------------------------------------------ +r778 | lh3 | 2009-01-19 06:24:29 -0500 (Mon, 19 Jan 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/kseq.h + A /branches/prog/bwa/kstring.c + A /branches/prog/bwa/kstring.h + M /branches/prog/bwa/main.c + M /branches/prog/bwa/simple_dp.c + + * bwa-0.4.2-5 + * update kseq.h to the latest version + * generate MD tag + * print mate coordinate if only one end is unmapped + +------------------------------------------------------------------------ +r775 | lh3 | 2009-01-18 05:40:35 -0500 (Sun, 18 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + 
* bwa-0.4.2-4 + * fixed a bug for SAM format + +------------------------------------------------------------------------ +r774 | lh3 | 2009-01-17 13:48:52 -0500 (Sat, 17 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.4.2-3 + * change default fnr to 0.04 + * print max_diff for valid fnr + +------------------------------------------------------------------------ +r773 | lh3 | 2009-01-17 05:54:37 -0500 (Sat, 17 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.4.2-2 + * automatically choose max_diff + +------------------------------------------------------------------------ +r772 | lh3 | 2009-01-16 18:16:14 -0500 (Fri, 16 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.4.2-1 + * take N as a mismatch + +------------------------------------------------------------------------ +r768 | lh3 | 2009-01-09 11:57:23 -0500 (Fri, 09 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/main.c + +Release bwa-0.4.2 + +------------------------------------------------------------------------ +r759 | lh3 | 2009-01-07 09:55:43 -0500 (Wed, 07 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + +Release bwa-0.4.1 + +------------------------------------------------------------------------ +r758 | lh3 | 2009-01-07 05:36:06 -0500 (Wed, 07 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M 
/branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.4.0-2 + * make mate_sw fully working + +------------------------------------------------------------------------ +r757 | lh3 | 2009-01-06 18:04:29 -0500 (Tue, 06 Jan 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.4.0-1 + * do SW alignment for unmapped mate. It is working. + * I still need to do some extra work for SW alignment, but it is too late + and I am getting tired... I will do tomorrow. + +------------------------------------------------------------------------ +r755 | lh3 | 2009-01-06 10:23:29 -0500 (Tue, 06 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.4.0 + +------------------------------------------------------------------------ +r754 | lh3 | 2009-01-06 07:45:02 -0500 (Tue, 06 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtgap.h + M /branches/prog/bwa/main.c + + * bwa-0.3.0-12 + * better lock + +------------------------------------------------------------------------ +r753 | lh3 | 2009-01-06 06:17:21 -0500 (Tue, 06 Jan 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-11 + * fixed a small memory leak in bwa_seq_close() + * fixed "uninitialized memory" from bwt_aln1_t + * multithreading for "aln" command + +------------------------------------------------------------------------ +r752 | lh3 | 2009-01-05 17:34:13 -0500 (Mon, 05 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + D 
/branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwt_gen/bwt_gen.c + A /branches/prog/bwa/bwtmisc.c (from /branches/prog/bwa/pac2bwt.c:748) + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + D /branches/prog/bwa/pac2bwt.c + + * bwa-0.3.0-10 + * a little bit code clean up + +------------------------------------------------------------------------ +r751 | lh3 | 2009-01-05 17:19:04 -0500 (Mon, 05 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-9 + * use 64-bit integer to speed up Occ calculate, although just a little bit + +------------------------------------------------------------------------ +r750 | lh3 | 2009-01-05 16:44:26 -0500 (Mon, 05 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-8 + * a little bit code cleanup + +------------------------------------------------------------------------ +r749 | lh3 | 2009-01-05 16:37:28 -0500 (Mon, 05 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-7 + * accelerate Occ calculation + +------------------------------------------------------------------------ +r748 | lh3 | 2009-01-05 16:12:28 -0500 (Mon, 05 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + M /branches/prog/bwa/pac2bwt.c + + * bwa-0.3.0-6 + * put occ table along with bwt to save another cache miss + * this version is already faster than the previous and I can still improve it... 
+ +------------------------------------------------------------------------ +r747 | lh3 | 2009-01-05 10:16:18 -0500 (Mon, 05 Jan 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-5 + * remove occ_major to save a cache miss; however, OCC_INTERVAL has to be + increased to keep the same memory. As a result, the speed is a little + slower in fact. + +------------------------------------------------------------------------ +r746 | lh3 | 2009-01-05 09:50:53 -0500 (Mon, 05 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-4 + * added back optimization codes (it is a pain...) + +------------------------------------------------------------------------ +r745 | lh3 | 2009-01-05 08:23:00 -0500 (Mon, 05 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-3 + * faster bit operations + +------------------------------------------------------------------------ +r744 | lh3 | 2009-01-05 05:58:46 -0500 (Mon, 05 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-2 + * removed optimization codes again... 
+ * use a new method to count the bits + +------------------------------------------------------------------------ +r743 | lh3 | 2009-01-04 17:18:38 -0500 (Sun, 04 Jan 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-1 + * added back the optimization codes + * added a new option to aln: max_entries, although this is disabled by default + * updated benchmark + +------------------------------------------------------------------------ +r742 | lh3 | 2009-01-04 07:56:12 -0500 (Sun, 04 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +add URL + +------------------------------------------------------------------------ +r740 | lh3 | 2009-01-04 07:39:43 -0500 (Sun, 04 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.3.0 + +------------------------------------------------------------------------ +r739 | lh3 | 2009-01-04 06:55:06 -0500 (Sun, 04 Jan 2009) | 2 lines +Changed paths: + A /branches/prog/bwa/COPYING + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bntseq.h + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/utils.c + M /branches/prog/bwa/utils.h + +added licensing information + +------------------------------------------------------------------------ +r738 | lh3 | 2009-01-04 06:18:25 -0500 (Sun, 04 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-31 + * better mapping quality + * update benchmark + +------------------------------------------------------------------------ +r737 | lh3 | 2009-01-03 16:00:58 -0500 (Sat, 03 Jan 
2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwa.1 + +update documentation + +------------------------------------------------------------------------ +r736 | lh3 | 2009-01-02 10:26:38 -0500 (Fri, 02 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update documentation + +------------------------------------------------------------------------ +r735 | lh3 | 2009-01-02 07:10:20 -0500 (Fri, 02 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-30 + * reduce memory a little bit + * update documentation + +------------------------------------------------------------------------ +r734 | lh3 | 2009-01-01 13:45:45 -0500 (Thu, 01 Jan 2009) | 8 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-29 + * sampe: removed -O option; changed default -o to 100000 + * sampe: fixed a bug in calculating paired mapping quality + * aln: added an option to search for suboptimal hits even if the best is a repeat. + This option will make sampe MUCH SLOWER. 
+ * sampe: set isize as zero if mapped to two different chr + * update manual (unfinished) + +------------------------------------------------------------------------ +r733 | lh3 | 2009-01-01 11:01:20 -0500 (Thu, 01 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-28 + * fixed a bug in calculating paired mapping quality + +------------------------------------------------------------------------ +r732 | lh3 | 2009-01-01 09:27:46 -0500 (Thu, 01 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + A /branches/prog/bwa/khash.h (from /branches/prog/sclib/khash/khash.h:675) + M /branches/prog/bwa/main.c + + * bwa-0.2.0-27 + * accelerate sampe by storing visited large intervals + +------------------------------------------------------------------------ +r731 | lh3 | 2009-01-01 06:51:21 -0500 (Thu, 01 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-26 + * remove the optimization codes + +------------------------------------------------------------------------ +r730 | lh3 | 2009-01-01 06:48:59 -0500 (Thu, 01 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-25 + * accelerate OCC calculation by ~7%. However, it seems not worth doing + this by complicating the code. I will change back later. 
+ +------------------------------------------------------------------------ +r729 | lh3 | 2008-12-31 16:43:56 -0500 (Wed, 31 Dec 2008) | 6 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-24 + * change command "sai2sam_pe" to "sampe" + * print usage for sampe command + * in sampe: change default max_occ to 1000 + * fixed a few compiling warnings in bntseq.c + +------------------------------------------------------------------------ +r728 | lh3 | 2008-12-27 07:14:59 -0500 (Sat, 27 Dec 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-22 + * mating information can be printed to SAM + +------------------------------------------------------------------------ +r727 | lh3 | 2008-12-26 18:10:59 -0500 (Fri, 26 Dec 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.2.0-21 + * implement pairing (still UNFINISHED) + * output all reads even if full of N + +------------------------------------------------------------------------ +r726 | lh3 | 2008-12-26 13:31:27 -0500 (Fri, 26 Dec 2008) | 5 lines +Changed paths: + M /branches/prog/bwa/Makefile + A /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * bwa-0.2.0-20 + * remove "-t" from aln cmd + * code clean up: move some functions in bwt2fmv.c to other source files + * added sai2sam_pe cmd: *UNFINISHED* + +------------------------------------------------------------------------ +r725 | lh3 | 2008-12-26 07:04:11 -0500 (Fri, 26 Dec 2008) | 3 lines +Changed 
paths: + M /branches/prog/bwa/Makefile + A /branches/prog/bwa/bwase.c + A /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/kseq.h + A /branches/prog/bwa/ksort.h (from /branches/prog/sclib/ksort/ksort.h:712) + A /branches/prog/bwa/kvec.h (from /branches/prog/sclib/kvec/kvec.h:537) + M /branches/prog/bwa/main.c + + * bwa-0.2.0-19 + * considerable code cleanup; no actual changes + +------------------------------------------------------------------------ +r724 | lh3 | 2008-12-25 11:32:11 -0500 (Thu, 25 Dec 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.2.0-18 + * generate SAM output + +------------------------------------------------------------------------ +r723 | lh3 | 2008-12-25 10:48:31 -0500 (Thu, 25 Dec 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * bwa-0.2.0-17 + * remove bwtsw2 related codes + * separate searching for SA interval from generating alignments + +------------------------------------------------------------------------ +r722 | lh3 | 2008-12-25 08:57:13 -0500 (Thu, 25 Dec 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt2fmv.c + D /branches/prog/bwa/bwt_lite.c + D /branches/prog/bwa/bwt_lite.h + M /branches/prog/bwa/bwtgap.c + D /branches/prog/bwa/bwtsw2.h + D /branches/prog/bwa/bwtsw2_aux.c + D /branches/prog/bwa/bwtsw2_core.c + D /branches/prog/bwa/bwtsw2_main.c + D /branches/prog/bwa/khash.h + D /branches/prog/bwa/ksort.h + D /branches/prog/bwa/kvec.h + M /branches/prog/bwa/main.c + + * added interface to "aln -t" + * remove bwtsw2 related codes + +------------------------------------------------------------------------ +r666 | lh3 | 2008-11-18 18:34:29 -0500 (Tue, 18 Nov 
2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.2.0-16 + * allow to set max mismatches based on read length, but I do not know + whether this really works + +------------------------------------------------------------------------ +r665 | lh3 | 2008-11-18 08:34:03 -0500 (Tue, 18 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-15 + * fixed a bug in sequence parser. + +------------------------------------------------------------------------ +r612 | lh3 | 2008-10-28 06:50:53 -0400 (Tue, 28 Oct 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/utils.c + + * bwa-0.2.0-14 + * fixed a bug caused by the change of the FASTA/Q parser + +------------------------------------------------------------------------ +r611 | lh3 | 2008-10-28 06:24:56 -0400 (Tue, 28 Oct 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bntseq.h + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtsw2_core.c + A /branches/prog/bwa/kseq.h + D /branches/prog/bwa/seq.c + D /branches/prog/bwa/seq.h + M /branches/prog/bwa/simple_dp.c + M /branches/prog/bwa/utils.c + M /branches/prog/bwa/utils.h + +replace seq.* with kseq.h + +------------------------------------------------------------------------ +r610 | lh3 | 2008-10-27 13:00:04 -0400 (Mon, 27 Oct 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-13 + * make bwtsw2 output sub-optimal hits. 
not completed + +------------------------------------------------------------------------ +r609 | lh3 | 2008-10-24 16:52:00 -0400 (Fri, 24 Oct 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/kvec.h + +little... + +------------------------------------------------------------------------ +r532 | lh3 | 2008-09-19 05:28:45 -0400 (Fri, 19 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/khash.h + +improve interface of khash + +------------------------------------------------------------------------ +r531 | lh3 | 2008-09-18 06:52:59 -0400 (Thu, 18 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +improve minor things, which make bwtsw2 slower, but should miss less true hits + +------------------------------------------------------------------------ +r530 | lh3 | 2008-09-17 18:19:26 -0400 (Wed, 17 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + + * fixed a bug in calculating ->D + * enforce band-width checking + +------------------------------------------------------------------------ +r529 | lh3 | 2008-09-17 18:06:49 -0400 (Wed, 17 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +delete a line of code that is never visited + +------------------------------------------------------------------------ +r528 | lh3 | 2008-09-17 17:58:51 -0400 (Wed, 17 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +a bit code clean up + +------------------------------------------------------------------------ +r527 | lh3 | 2008-09-17 10:55:45 -0400 (Wed, 17 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-12 + * max-depth can be set, although it does not help the speed at all + +------------------------------------------------------------------------ +r526 | lh3 | 2008-09-16 17:59:36 -0400 (Tue, 16 Sep 2008) | 2 lines +Changed 
paths: + M /branches/prog/bwa/bwtsw2_core.c + +cut_tail after remove duplicate + +------------------------------------------------------------------------ +r525 | lh3 | 2008-09-16 17:56:11 -0400 (Tue, 16 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/khash.h + M /branches/prog/bwa/main.c + + * bwa-0.2.0-11 + * improved cut_tail() + +------------------------------------------------------------------------ +r524 | lh3 | 2008-09-15 16:53:22 -0400 (Mon, 15 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-10 + * fixed a bug in cut_tail() + +------------------------------------------------------------------------ +r518 | lh3 | 2008-09-15 04:35:59 -0400 (Mon, 15 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +a bit code clean up + +------------------------------------------------------------------------ +r517 | lh3 | 2008-09-14 18:18:11 -0400 (Sun, 14 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +improve speed (<1%) + +------------------------------------------------------------------------ +r516 | lh3 | 2008-09-14 18:08:55 -0400 (Sun, 14 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + + * fixed two potential bugs, although I have not seen their effects + * improve speed a bit (<2%) + +------------------------------------------------------------------------ +r515 | lh3 | 2008-09-14 17:26:49 -0400 (Sun, 14 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + +nothing, really + +------------------------------------------------------------------------ +r514 | lh3 | 2008-09-14 17:10:13 -0400 (Sun, 14 Sep 2008) | 2 lines +Changed paths: + M 
/branches/prog/bwa/bwtsw2_core.c + +disable X-drop, which has to be reimplemented in the current algorithm + +------------------------------------------------------------------------ +r513 | lh3 | 2008-09-14 16:49:42 -0400 (Sun, 14 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt_lite.c + M /branches/prog/bwa/bwt_lite.h + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + + * temporarily disable cut_tail() + * calculate SA in bwt_lite.c + * fixed a bug in reversing the sequence + +------------------------------------------------------------------------ +r512 | lh3 | 2008-09-13 17:35:40 -0400 (Sat, 13 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + A /branches/prog/bwa/ksort.h + +n-best method + +------------------------------------------------------------------------ +r507 | lh3 | 2008-09-13 09:06:54 -0400 (Sat, 13 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2_core.c + +give correct result again + +------------------------------------------------------------------------ +r506 | lh3 | 2008-09-13 08:12:07 -0400 (Sat, 13 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +I think I know the reason. It needs more work... 
+ +------------------------------------------------------------------------ +r505 | lh3 | 2008-09-13 06:20:43 -0400 (Sat, 13 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2_core.c + +fixed another bug, but still have + +------------------------------------------------------------------------ +r504 | lh3 | 2008-09-12 18:13:37 -0400 (Fri, 12 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +fixed another bug + +------------------------------------------------------------------------ +r503 | lh3 | 2008-09-12 17:15:56 -0400 (Fri, 12 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/khash.h + + * do not segfault, but the result is WRONG! + * prepare to remove bsw2_connectivity_check() + +------------------------------------------------------------------------ +r502 | lh3 | 2008-09-12 15:52:41 -0400 (Fri, 12 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/kvec.h + +more revisions + +------------------------------------------------------------------------ +r501 | lh3 | 2008-09-11 18:06:15 -0400 (Thu, 11 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +further simplify codes with kvec.h + +------------------------------------------------------------------------ +r500 | lh3 | 2008-09-11 17:42:15 -0400 (Thu, 11 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +part of revisions... 
have not finished + +------------------------------------------------------------------------ +r499 | lh3 | 2008-09-11 17:24:15 -0400 (Thu, 11 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/khash.h + A /branches/prog/bwa/kvec.h + +prepare for abrupt change + +------------------------------------------------------------------------ +r496 | lh3 | 2008-09-11 10:34:38 -0400 (Thu, 11 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +fixed a bug; now "bwtsw2 -d" is useless + +------------------------------------------------------------------------ +r495 | lh3 | 2008-09-11 09:22:03 -0400 (Thu, 11 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/simple_dp.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + +improve speed a little bit + +------------------------------------------------------------------------ +r494 | lh3 | 2008-09-11 08:28:08 -0400 (Thu, 11 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +remove debug codes + +------------------------------------------------------------------------ +r493 | lh3 | 2008-09-11 07:49:53 -0400 (Thu, 11 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + + * improve the speed a little bit (<5%) + * prepare to remove BSW_DEBUG + +------------------------------------------------------------------------ +r492 | lh3 | 2008-09-11 06:15:56 -0400 (Thu, 11 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-9 + * support reverse strand + * fixed a bug that causes missing hits + +------------------------------------------------------------------------ +r491 | lh3 | 2008-09-11 05:46:16 -0400 (Thu, 11 Sep 2008) | 3 lines 
+Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-8 + * better progress report + +------------------------------------------------------------------------ +r490 | lh3 | 2008-09-10 17:04:49 -0400 (Wed, 10 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-7 + * avoid some missing hits + * add maximum depth + +------------------------------------------------------------------------ +r489 | lh3 | 2008-09-10 11:51:13 -0400 (Wed, 10 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-6 + * bwtsw2 works although on the forward strand only for now + * better progress information + +------------------------------------------------------------------------ +r488 | lh3 | 2008-09-10 10:21:53 -0400 (Wed, 10 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + + * implement memory pool + * avoid some rehashing + +------------------------------------------------------------------------ +r487 | lh3 | 2008-09-10 09:23:38 -0400 (Wed, 10 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + + * fixed a memory leak + * prepare to implement mempool + +------------------------------------------------------------------------ +r486 | lh3 | 2008-09-10 09:10:09 -0400 (Wed, 10 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/khash.h + + * add X-dropoff + * remove duplicated results + * switch to simple stack + +------------------------------------------------------------------------ +r485 | lh3 | 2008-09-10 06:31:20 -0400 (Wed, 10 Sep 2008) | 3 lines 
+Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + + * check whether t-node has been visited + * prepare to remove two-level stack + +------------------------------------------------------------------------ +r484 | lh3 | 2008-09-10 05:00:57 -0400 (Wed, 10 Sep 2008) | 2 lines +Changed paths: + A /branches/prog/bwa/khash.h + +khash library + +------------------------------------------------------------------------ +r483 | lh3 | 2008-09-10 04:22:53 -0400 (Wed, 10 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +add inline + +------------------------------------------------------------------------ +r482 | lh3 | 2008-09-09 16:34:57 -0400 (Tue, 09 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + +improve speed + +------------------------------------------------------------------------ +r481 | lh3 | 2008-09-09 13:13:00 -0400 (Tue, 09 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +Use a 128bit hash table to keep all (tk,tl,qk,ql). This is slow. Just +keep a copy in case I may need this in future. 
+ + +------------------------------------------------------------------------ +r480 | lh3 | 2008-09-09 12:53:32 -0400 (Tue, 09 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_core.c + + * no principal modification + +------------------------------------------------------------------------ +r479 | lh3 | 2008-09-09 11:01:45 -0400 (Tue, 09 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2_core.c + + * fixed a bug which may cause duplicated matching + * accelerate the speed a bit, although using hash in avoiding duplications + slows the speed down in the end + +------------------------------------------------------------------------ +r474 | lh3 | 2008-09-03 17:22:57 -0400 (Wed, 03 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-5 + * indel seems to work on toy example + * add band + +------------------------------------------------------------------------ +r469 | lh3 | 2008-09-01 09:18:45 -0400 (Mon, 01 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt_lite.c + M /branches/prog/bwa/bwt_lite.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtsw2.h + A /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/is.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + M /branches/prog/bwa/simple_dp.c + + * bwa-0.2.0-4 + * updated bwtsw2, which seems to work properly on toy examples + +------------------------------------------------------------------------ +r447 | lh3 | 2008-08-27 10:05:09 -0400 (Wed, 27 Aug 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M 
/branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-3 + * tune for longer gaps, but it does not really work with kilo-bp gaps... + +------------------------------------------------------------------------ +r446 | lh3 | 2008-08-26 13:30:41 -0400 (Tue, 26 Aug 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-2 + * changed the way to extend long deletions. Now use max_del_occ. + +------------------------------------------------------------------------ +r445 | lh3 | 2008-08-26 13:05:58 -0400 (Tue, 26 Aug 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt_lite.c + M /branches/prog/bwa/bwt_lite.h + +updated from bwtsw2_lite + +------------------------------------------------------------------------ +r436 | lh3 | 2008-08-23 12:28:44 -0400 (Sat, 23 Aug 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt.h + A /branches/prog/bwa/bwt_lite.c + A /branches/prog/bwa/bwt_lite.h + A /branches/prog/bwa/bwtsw2.h + A /branches/prog/bwa/bwtsw2_core.c + A /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-1 + * add bwt_lite: a light-weighted version of bwt (NOT TESTED!) + * add core codes for bwtsw2: NOT TESTED!!! 
+ +------------------------------------------------------------------------ +r427 | lh3 | 2008-08-15 05:38:12 -0400 (Fri, 15 Aug 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + +Release bwa-0.2.0 + +------------------------------------------------------------------------ +r426 | lh3 | 2008-08-14 11:26:19 -0400 (Thu, 14 Aug 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.1.6-7 + * change default seed length to 31 + * add incomplete support to color sequences (not tested yet!) + +------------------------------------------------------------------------ +r425 | lh3 | 2008-08-14 06:23:11 -0400 (Thu, 14 Aug 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.1.6-6 + * change default seed length to 33bp + +------------------------------------------------------------------------ +r424 | lh3 | 2008-08-14 05:55:33 -0400 (Thu, 14 Aug 2008) | 6 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.1.6-5 + * fixed a bug that may miss true alignments. this bug exists in most + early versions. + * fixed a bug that yields wrong coordinates for reads mapped on the forward + strands with gaps. + +------------------------------------------------------------------------ +r423 | lh3 | 2008-08-14 04:07:28 -0400 (Thu, 14 Aug 2008) | 2 lines +Changed paths: + D /branches/prog/bwa/Makefile.div + +useless + +------------------------------------------------------------------------ +r422 | lh3 | 2008-08-13 19:21:14 -0400 (Wed, 13 Aug 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.1.6-4 + * fixed one bug + * there is another one... 
+ +------------------------------------------------------------------------ +r421 | lh3 | 2008-08-13 18:23:33 -0400 (Wed, 13 Aug 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtgap.h + M /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/main.c + + * bwa-0.1.6-3 + * almost there, but not quite right + +------------------------------------------------------------------------ +r419 | lh3 | 2008-08-13 17:27:02 -0400 (Wed, 13 Aug 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtgap.h + M /branches/prog/bwa/main.c + + * improve the seeding method + * prepare to load two BWTs into memory. A BIG change! + +------------------------------------------------------------------------ +r418 | lh3 | 2008-08-13 10:56:54 -0400 (Wed, 13 Aug 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtgap.h + M /branches/prog/bwa/main.c + + * added seeding + * unfinished yet + +------------------------------------------------------------------------ +r413 | lh3 | 2008-08-08 11:48:35 -0400 (Fri, 08 Aug 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/main.c + +Release bwa-0.1.6 + +------------------------------------------------------------------------ +r410 | lh3 | 2008-08-06 15:48:22 -0400 (Wed, 06 Aug 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/simple_dp.c + +sw: output alignment score + +------------------------------------------------------------------------ +r407 | lh3 | 2008-08-04 10:01:20 -0400 (Mon, 04 Aug 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + A /branches/prog/bwa/simple_dp.c + M 
/branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.1.5-3 + * added a simple interface to SW/NW alignment + * stdaln-0.9.8 (see header for more details) + +------------------------------------------------------------------------ +r406 | lh3 | 2008-08-01 19:21:59 -0400 (Fri, 01 Aug 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + A /branches/prog/bwa/stdaln.c + A /branches/prog/bwa/stdaln.h + + * bwa-0.1.5-2 + * give accurate gap positions + +------------------------------------------------------------------------ +r405 | lh3 | 2008-08-01 19:06:19 -0400 (Fri, 01 Aug 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + +unfinished, but I am tired... + +------------------------------------------------------------------------ +r401 | lh3 | 2008-07-30 05:59:24 -0400 (Wed, 30 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/main.c + + * bwa-0.1.5-1 + * fixed a potential bug which may produce an alignment in N regions, + although extremely rare. 
+ +------------------------------------------------------------------------ +r399 | lh3 | 2008-07-27 11:41:52 -0400 (Sun, 27 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/main.c + +Release bwa-0.1.5 + +------------------------------------------------------------------------ +r398 | lh3 | 2008-07-25 12:14:47 -0400 (Fri, 25 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update documentation + +------------------------------------------------------------------------ +r397 | lh3 | 2008-07-25 09:58:56 -0400 (Fri, 25 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * + +------------------------------------------------------------------------ +r396 | lh3 | 2008-07-25 06:42:01 -0400 (Fri, 25 Jul 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.1.4-4 + * add timer for debugging + +------------------------------------------------------------------------ +r395 | lh3 | 2008-07-24 05:46:21 -0400 (Thu, 24 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.1.4-3 + * fixed a bug in the previous code + * this version gives identical result to bwa-0.1.4, just 10% faster + +------------------------------------------------------------------------ +r394 | lh3 | 2008-07-24 05:18:53 -0400 (Thu, 24 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtgap.h + M /branches/prog/bwa/main.c + + * bwa-0.1.4-2 + * further improve the speed + * The result is slightly different from bwa-0.1.4 now. I need to check... 
+ +------------------------------------------------------------------------ +r393 | lh3 | 2008-07-23 12:04:16 -0400 (Wed, 23 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt.c + +comments only + +------------------------------------------------------------------------ +r392 | lh3 | 2008-07-23 10:34:03 -0400 (Wed, 23 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + +further improve the speed in Occ functions + +------------------------------------------------------------------------ +r386 | lh3 | 2008-07-22 10:03:54 -0400 (Tue, 22 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/main.c + +Release bwa-0.1.4 + +------------------------------------------------------------------------ +r385 | lh3 | 2008-07-22 09:44:50 -0400 (Tue, 22 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwa.1 + +update documentation and ChangeLog + +------------------------------------------------------------------------ +r384 | lh3 | 2008-07-22 08:50:03 -0400 (Tue, 22 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.1.3-2 + * fixed the bug in the last modification + * now the alignment should be more clearly defined + +------------------------------------------------------------------------ +r383 | lh3 | 2008-07-21 18:32:21 -0400 (Mon, 21 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.1.3-1 + * this is a buggy verion! + * i will fix the bug tomorrow. It is late... 
+ +------------------------------------------------------------------------ +r381 | lh3 | 2008-07-21 06:45:32 -0400 (Mon, 21 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.1.3 + +------------------------------------------------------------------------ +r380 | lh3 | 2008-07-21 06:07:43 -0400 (Mon, 21 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/main.c + + * bwa-0.1.2-3 + * improve the speed for gcc on Intel Mac OS X, but not really on icc on Linux + * aln: more command-line options + +------------------------------------------------------------------------ +r373 | lh3 | 2008-07-17 09:09:46 -0400 (Thu, 17 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/main.c + + * bwa-0.1.2-2 + * further improve the speed + * this version gives exactly the same result as bwa-0.1.2 + +------------------------------------------------------------------------ +r372 | lh3 | 2008-07-17 07:51:08 -0400 (Thu, 17 Jul 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.1.2-1 + * speed up by about 5% + +------------------------------------------------------------------------ +r370 | lh3 | 2008-07-17 05:12:00 -0400 (Thu, 17 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.1.2 + +------------------------------------------------------------------------ +r368 | lh3 | 2008-07-16 08:51:25 -0400 (Wed, 16 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + D /branches/prog/bwa/bwt1away.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M 
/branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtgap.h + D /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.1-9 + * some code cleanup + * remove 1away and top2 + +------------------------------------------------------------------------ +r367 | lh3 | 2008-07-16 08:24:34 -0400 (Wed, 16 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/is.c + +Yuta Mori's implementation of IS algorithm. + +------------------------------------------------------------------------ +r365 | lh3 | 2008-07-16 06:58:04 -0400 (Wed, 16 Jul 2008) | 6 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtgap.h + M /branches/prog/bwa/main.c + + * bwa-0.1.1-8 + * improve gapped alignment + * this version will miss more gapped alignments, but the speed is much faster + * prepare to remove top2 and 1away algorithms + * prepare to add SAIS algorithm for bwt construction + +------------------------------------------------------------------------ +r358 | lh3 | 2008-06-09 06:03:04 -0400 (Mon, 09 Jun 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.1.1-7 + * change END_SKIP from 3 to 5, but still gaps may be wrongly added + * change default '-g' from 5 to 3 + +------------------------------------------------------------------------ +r357 | lh3 | 2008-06-09 05:18:36 -0400 (Mon, 09 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.1.1-6 + * fix a bug in nested stack + +------------------------------------------------------------------------ +r356 | lh3 | 2008-06-08 18:43:13 -0400 (Sun, 08 Jun 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + A /branches/prog/bwa/bwtgap.h + M 
/branches/prog/bwa/main.c + + * bwa-0.1.1-5 + * replace heap with nested stacks + * there are still obvious bugs... + +------------------------------------------------------------------------ +r355 | lh3 | 2008-06-08 17:13:44 -0400 (Sun, 08 Jun 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * bwa-0.1.1-4 + * add interface to affine gap alignment + * there are obvious bugs and I will fix them later + +------------------------------------------------------------------------ +r354 | lh3 | 2008-06-08 15:39:05 -0400 (Sun, 08 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.1.1-3 + * affine gap seems to work, at least partially + +------------------------------------------------------------------------ +r353 | lh3 | 2008-06-08 09:27:18 -0400 (Sun, 08 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + A /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.1-2 + * initial gapped alignment. 
not work at the moment + +------------------------------------------------------------------------ +r352 | lh3 | 2008-06-06 04:37:34 -0400 (Fri, 06 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.1-1 + * ungap: remove a useless variable in top2_entry_t + +------------------------------------------------------------------------ +r348 | lh3 | 2008-06-03 09:04:12 -0400 (Tue, 03 Jun 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + A /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.1.1 + +------------------------------------------------------------------------ +r347 | lh3 | 2008-06-03 05:45:08 -0400 (Tue, 03 Jun 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update documentation + +------------------------------------------------------------------------ +r346 | lh3 | 2008-06-02 18:59:50 -0400 (Mon, 02 Jun 2008) | 5 lines +Changed paths: + A /branches/prog/bwa/ChangeLog + A /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-11 + * improve approximating mapping qualities + * add documentation + * add ChangeLog + +------------------------------------------------------------------------ +r345 | lh3 | 2008-06-02 16:04:39 -0400 (Mon, 02 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-10 + * output a random position for repetitive reads + +------------------------------------------------------------------------ +r344 | lh3 | 2008-06-02 15:03:54 -0400 (Mon, 02 Jun 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/pac2bwt.c + + * bwa-0.1.0-9 + * fix memory leaks + * fix a potential bug in converting to the real coordinate + 
+------------------------------------------------------------------------ +r343 | lh3 | 2008-06-02 13:44:51 -0400 (Mon, 02 Jun 2008) | 5 lines +Changed paths: + M /branches/prog/bwa/Makefile.div + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-8 + * fix a bug about strand + * update Makefile.div + * change top2b as the default method + +------------------------------------------------------------------------ +r342 | lh3 | 2008-06-02 11:23:26 -0400 (Mon, 02 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt1away.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-7 + * use bwt_2occ() and bwt_2occ4() in other functions + +------------------------------------------------------------------------ +r341 | lh3 | 2008-06-02 09:31:39 -0400 (Mon, 02 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-6 + * fix a bug for missing hits + +------------------------------------------------------------------------ +r340 | lh3 | 2008-06-02 09:10:18 -0400 (Mon, 02 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-5 + * accelerate comparisons in heap, a bit + +------------------------------------------------------------------------ +r339 | lh3 | 2008-06-02 08:41:31 -0400 (Mon, 02 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-4 + * avoid marginal repeated calculation in occ + +------------------------------------------------------------------------ +r338 | lh3 | 2008-06-02 06:46:51 -0400 (Mon, 02 Jun 2008) | 5 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt.c + M 
/branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-3 + * fix a bug caused by previous change + * fix a bug in heap + * order the heap by more criteria + +------------------------------------------------------------------------ +r337 | lh3 | 2008-06-01 19:11:15 -0400 (Sun, 01 Jun 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-2 + * also sort sa range in heapsort, in attempt to improve cache performance. + Unfortunately, it does not work well at all. + +------------------------------------------------------------------------ +r336 | lh3 | 2008-06-01 17:45:23 -0400 (Sun, 01 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/Makefile.div + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/main.c + + * 0.1.0-1 + * fix a bug in calculating the real coordinate + +------------------------------------------------------------------------ +r335 | lh3 | 2008-06-01 16:03:09 -0400 (Sun, 01 Jun 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + +nothing, really + +------------------------------------------------------------------------ +r334 | lh3 | 2008-06-01 15:59:13 -0400 (Sun, 01 Jun 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + A /branches/prog/bwa/Makefile.div + M /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/pac2bwt.c + +use IS algorithm by default + +------------------------------------------------------------------------ +r333 | lh3 | 2008-06-01 15:05:15 -0400 (Sun, 01 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/is.c + M /branches/prog/bwa/pac2bwt.c + + * a bit code clean up in is.c + * add IS algorithm for constructing BWT, albeit slower + +------------------------------------------------------------------------ +r332 | lh3 | 2008-06-01 13:23:08 -0400 (Sun, 01 Jun 2008) | 2 lines +Changed paths: + A 
/branches/prog/bwa/is.c + +IS linear-time algorithm for constructing SA/BWT + +------------------------------------------------------------------------ +r331 | lh3 | 2008-06-01 10:35:26 -0400 (Sun, 01 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bntseq.c + A /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * fix a bug in generating .pac + * index in one go + +------------------------------------------------------------------------ +r330 | lh3 | 2008-06-01 09:17:05 -0400 (Sun, 01 Jun 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bntseq.h + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwttop2.c + +real coordinates can be ouput + +------------------------------------------------------------------------ +r329 | lh3 | 2008-05-31 19:21:02 -0400 (Sat, 31 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwttop2.c + +add top2e which is similar to 1away + +------------------------------------------------------------------------ +r328 | lh3 | 2008-05-31 18:46:12 -0400 (Sat, 31 May 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * unified cmd-line interface for ungapped alignment + * add two alternatives to top2 algorithm + +------------------------------------------------------------------------ +r327 | lh3 | 2008-05-31 18:14:46 -0400 (Sat, 31 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + +add cmd-line interface to alntop2 + +------------------------------------------------------------------------ +r326 | lh3 | 2008-05-31 17:59:31 -0400 (Sat, 
31 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt1away.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + A /branches/prog/bwa/bwttop2.c + +top2 algorithm seems to work. I need to change interface, though + +------------------------------------------------------------------------ +r325 | lh3 | 2008-05-31 15:11:49 -0400 (Sat, 31 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt1away.c + +change the variable in the structure + +------------------------------------------------------------------------ +r324 | lh3 | 2008-05-31 14:52:13 -0400 (Sat, 31 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt1away.c + +set a slightly better bound on the maximum allowed mismatches + +------------------------------------------------------------------------ +r323 | lh3 | 2008-05-30 18:40:21 -0400 (Fri, 30 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + + * output time statistics + +------------------------------------------------------------------------ +r322 | lh3 | 2008-05-30 17:58:25 -0400 (Fri, 30 May 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + A /branches/prog/bwa/bwt1away.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + + * presumably better way to make use of prefix. But for the moment I do + not know whether it is correct or not. 
+ * a bit code clean up: separate alignment part + +------------------------------------------------------------------------ +r321 | lh3 | 2008-05-30 13:57:43 -0400 (Fri, 30 May 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt_gen/Makefile + M /branches/prog/bwa/bwt_gen/bwt_gen.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + M /branches/prog/bwa/pac2bwt.c + + * a bit code clean up + * put bwt_gen in bwa + +------------------------------------------------------------------------ +r320 | lh3 | 2008-05-30 11:40:11 -0400 (Fri, 30 May 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtio.c + + * improve cmd-line interface + * fix a bug in loading .sa + * change default sa interval to 32 + +------------------------------------------------------------------------ +r319 | lh3 | 2008-05-30 10:31:37 -0400 (Fri, 30 May 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + + * fix memory leak (I know that. 
Just a bit lazy) + * change to another method to do 1-away alignment + +------------------------------------------------------------------------ +r318 | lh3 | 2008-05-30 09:21:49 -0400 (Fri, 30 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + +best unique match is partially finished + +------------------------------------------------------------------------ +r317 | lh3 | 2008-05-30 06:33:28 -0400 (Fri, 30 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + +remove "ungapped" command and related codes + +------------------------------------------------------------------------ +r316 | lh3 | 2008-05-30 06:05:20 -0400 (Fri, 30 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + +change variable name thick to width + +------------------------------------------------------------------------ +r315 | lh3 | 2008-05-29 19:06:13 -0400 (Thu, 29 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + M /branches/prog/bwa/pac2bwt.c + +revised algorithm for ungapped alignment. the old one can still be used. 
+ +------------------------------------------------------------------------ +r314 | lh3 | 2008-05-29 16:36:11 -0400 (Thu, 29 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwt_gen/bwt_gen.c + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/pac2bwt.c + + * make commands more independent, but ungapped does not work at the moment + +------------------------------------------------------------------------ +r313 | lh3 | 2008-05-29 15:56:14 -0400 (Thu, 29 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt_gen/bwt_gen.c + +little... + +------------------------------------------------------------------------ +r312 | lh3 | 2008-05-29 15:54:01 -0400 (Thu, 29 May 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt_gen/bwt_gen.c + M /branches/prog/bwa/bwt_gen/bwt_gen.h + + * add CopyRight information from the original codes + * do not dump .fmv files + +------------------------------------------------------------------------ +r311 | lh3 | 2008-05-29 15:44:36 -0400 (Thu, 29 May 2008) | 2 lines +Changed paths: + A /branches/prog/bwa/bwt_gen + A /branches/prog/bwa/bwt_gen/Makefile + A /branches/prog/bwa/bwt_gen/QSufSort.c + A /branches/prog/bwa/bwt_gen/QSufSort.h + A /branches/prog/bwa/bwt_gen/bwt_gen.c + A /branches/prog/bwa/bwt_gen/bwt_gen.h + +codes from BWT-SW, for building BWT from packed file + +------------------------------------------------------------------------ +r310 | lh3 | 2008-05-28 17:03:35 -0400 (Wed, 28 May 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * change OCC_INTERVAL to 0x40, which makes bwa twice as fast. 
+ * write Occ file as ".occ" as it is using a different interval from + .fmv, the BWT-SW counterpart of .occ + +------------------------------------------------------------------------ +r309 | lh3 | 2008-05-28 11:39:37 -0400 (Wed, 28 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt2fmv.c + +fix a bug + +------------------------------------------------------------------------ +r308 | lh3 | 2008-05-28 09:56:16 -0400 (Wed, 28 May 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt2fmv.c + +add heuristics to improve the speed, but I have not tested whether the +results are correct or not. + + +------------------------------------------------------------------------ +r307 | lh3 | 2008-05-28 06:31:34 -0400 (Wed, 28 May 2008) | 5 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * make ungapped alignment basically work... + * but it is very slow in comparison to others... + * also I need to improve the interface... + * a lot of things to keep me busy today... + +------------------------------------------------------------------------ +r306 | lh3 | 2008-05-27 18:41:27 -0400 (Tue, 27 May 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwtaln.c + + * remove recursion + * fixed a bug in bwt_occ() + +------------------------------------------------------------------------ +r305 | lh3 | 2008-05-27 16:59:44 -0400 (Tue, 27 May 2008) | 5 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwtaln.c + + * bwa now tells whether a sequence can be mapped with maximum allowed + mismatches. ONLY ungapped. + * this is a recursive version. I will remove recursion later. 
+ + +------------------------------------------------------------------------ +r304 | lh3 | 2008-05-27 09:12:17 -0400 (Tue, 27 May 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + A /branches/prog/bwa/bwtaln.c + A /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + M /branches/prog/bwa/utils.c + + * load .sa and .fmv files + * exact alignment now works + +------------------------------------------------------------------------ +r303 | lh3 | 2008-05-27 06:33:38 -0400 (Tue, 27 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/utils.c + M /branches/prog/bwa/utils.h + +add xassert and fix a bug + +------------------------------------------------------------------------ +r302 | lh3 | 2008-05-27 06:23:20 -0400 (Tue, 27 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwtio.c + A /branches/prog/bwa/utils.c + A /branches/prog/bwa/utils.h + +improve error message and error handling + +------------------------------------------------------------------------ +r301 | lh3 | 2008-05-27 05:37:51 -0400 (Tue, 27 May 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + A /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * move I/O codes to bwtio.c + * SA can be dumped and interestingly, it is identical to BWTSW + * now, .fmv is still different from BWTSW + +------------------------------------------------------------------------ +r299 | lh3 | 2008-05-26 18:07:44 -0400 (Mon, 26 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M 
/branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + +generate/retrieve SA and Occ + +------------------------------------------------------------------------ +r298 | lh3 | 2008-05-26 13:16:49 -0400 (Mon, 26 May 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bntseq.h + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + + * retrieve occ value at any position + * move bwt_cal_occ() to bwt.c + +------------------------------------------------------------------------ +r297 | lh3 | 2008-05-25 17:43:58 -0400 (Sun, 25 May 2008) | 6 lines +Changed paths: + M /branches/prog/bwa/Makefile + A /branches/prog/bwa/bwt.c + A /branches/prog/bwa/bwt.h + A /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + M /branches/prog/bwa/pac2bwt.c + + * add bwt2fmv. It works to some extend. However, I do not understand + the purpose of some weird codes in BWT-SW. As a consequence, bwt2fmv + could generate a file almost identical, but not exactly identical, to + the .fmv file from BWT-SW. + + +------------------------------------------------------------------------ +r296 | lh3 | 2008-05-24 18:35:02 -0400 (Sat, 24 May 2008) | 5 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bntseq.h + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + A /branches/prog/bwa/pac2bwt.c + +Burrows-Wheeler Transform now works. At least on one example, the +current code generates the same BWT as BWT-SW. Kind of magical, I would +say. 
:) + + +------------------------------------------------------------------------ +r295 | lh3 | 2008-05-24 11:25:31 -0400 (Sat, 24 May 2008) | 3 lines +Changed paths: + A /branches/prog/bwa/Makefile + M /branches/prog/bwa/bntseq.c + A /branches/prog/bwa/main.c + A /branches/prog/bwa/main.h + + * add Makefile and main.* + * improve interface to fa2bns, a bit + +------------------------------------------------------------------------ +r293 | lh3 | 2008-05-24 10:57:03 -0400 (Sat, 24 May 2008) | 3 lines +Changed paths: + A /branches/prog/bwa + A /branches/prog/bwa/bntseq.c + A /branches/prog/bwa/bntseq.h + A /branches/prog/bwa/seq.c + A /branches/prog/bwa/seq.h + + * Burrow-Wheeler Alignment + * initial codes + +------------------------------------------------------------------------ diff --git a/src/bwa/Makefile b/src/bwa/Makefile new file mode 100644 index 000000000..247ca04e5 --- /dev/null +++ b/src/bwa/Makefile @@ -0,0 +1,88 @@ +CC= gcc +#CC= clang --analyze +CFLAGS= -g -Wall -Wno-unused-function -O2 +WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS +AR= ar +DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) +LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o +AOBJS= QSufSort.o bwt_gen.o bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ + is.o bwtindex.o bwape.o kopen.o pemerge.o \ + bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ + bwtsw2_chain.o fastmap.o bwtsw2_pair.o +PROG= bwa +INCLUDES= +LIBS= -lm -lz -lpthread +SUBDIRS= . + +ifeq ($(shell uname -s),Linux) + LIBS += -lrt +endif + +.SUFFIXES:.c .o .cc + +.c.o: + $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ + +##all:$(PROG) +all:libbwa.a + +bwa:libbwa.a $(AOBJS) main.o + $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS) + +bwamem-lite:libbwa.a example.o + $(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ -L. 
-lbwa $(LIBS) + +libbwa.a:$(LOBJS) + $(AR) -csru $@ $(LOBJS) + +clean: + rm -f gmon.out *.o a.out $(PROG) *~ *.a + +depend: + ( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c ) + +# DO NOT DELETE THIS LINE -- make depend depends on it. + +QSufSort.o: QSufSort.h +bamlite.o: bamlite.h malloc_wrap.h +bntseq.o: bntseq.h utils.h kseq.h malloc_wrap.h khash.h +bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kstring.h malloc_wrap.h kseq.h +bwamem.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h +bwamem.o: ksort.h utils.h kbtree.h +bwamem_extra.o: bwa.h bntseq.h bwt.h bwamem.h kstring.h malloc_wrap.h +bwamem_pair.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h kvec.h +bwamem_pair.o: utils.h ksw.h +bwape.o: bwtaln.h bwt.h kvec.h malloc_wrap.h bntseq.h utils.h bwase.h bwa.h +bwape.o: ksw.h khash.h +bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h +bwase.o: bwa.h ksw.h +bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h +bwashm.o: bwa.h bntseq.h bwt.h +bwt.o: utils.h bwt.h kvec.h malloc_wrap.h +bwt_gen.o: QSufSort.h malloc_wrap.h +bwt_lite.o: bwt_lite.h malloc_wrap.h +bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h malloc_wrap.h +bwtgap.o: bwtgap.h bwt.h bwtaln.h malloc_wrap.h +bwtindex.o: bntseq.h bwt.h utils.h malloc_wrap.h +bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h +bwtsw2_aux.o: malloc_wrap.h bwa.h ksw.h kseq.h ksort.h +bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h malloc_wrap.h ksort.h +bwtsw2_core.o: bwt_lite.h bwtsw2.h bntseq.h bwt.h kvec.h malloc_wrap.h +bwtsw2_core.o: khash.h ksort.h +bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h +bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h +bwtsw2_pair.o: malloc_wrap.h ksw.h +example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h +fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h +is.o: malloc_wrap.h +kopen.o: malloc_wrap.h +kstring.o: 
kstring.h malloc_wrap.h +ksw.o: ksw.h malloc_wrap.h +main.o: kstring.h malloc_wrap.h utils.h +malloc_wrap.o: malloc_wrap.h +pemerge.o: ksw.h kseq.h malloc_wrap.h kstring.h bwa.h bntseq.h bwt.h utils.h +utils.o: utils.h ksort.h malloc_wrap.h kseq.h + +EMPTY_AUTOMAKE_TARGETS = dvi pdf ps info html tags ctags +.PHONY: $(EMPTY_AUTOMAKE_TARGETS) +$(EMPTY_AUTOMAKE_TARGETS): diff --git a/src/bwa/NEWS.md b/src/bwa/NEWS.md new file mode 100644 index 000000000..4692889e4 --- /dev/null +++ b/src/bwa/NEWS.md @@ -0,0 +1,1146 @@ +Release 0.7.12 (28 December 2014) +--------------------------------- + +This release fixed a bug in the pair-end mode when ALT contigs are present. It +leads to undercalling in regions overlapping ALT contigs. + +(0.7.12: 28 December 2014, r1039) + + + +Release 0.7.11 (23 December, 2014) +---------------------------------- + +A major change to BWA-MEM is the support of mapping to ALT contigs in addition +to the primary assembly. Part of the ALT mapping strategy is implemented in +BWA-MEM and the rest in a postprocessing script for now. Due to the extra +layer of complexity on generating the reference genome and on the two-step +mapping, we start to provide a wrapper script and precompiled binaries since +this release. The package may be more convenient to some specific use cases. +For general uses, the single BWA binary still works like the old way. + +Another major addition to BWA-MEM is HLA typing, which made possible with the +new ALT mapping strategy. Necessary data and programs are included in the +binary release. The wrapper script also optionally performs HLA typing when HLA +genes are included in the reference genome as additional ALT contigs. + +Other notable changes to BWA-MEM: + + * Added option `-b` to `bwa index`. This option tunes the batch size used in + the construction of BWT. It is advised to use large `-b` for huge reference + sequences such as the BLAST *nt* database. + + * Optimized for PacBio data. 
This includes a change to scoring based on a + study done by Aaron Quinlan and a heuristic speedup. Further speedup is + possible, but needs more careful investigation. + + * Dropped PacBio read-to-read alignment for now. BWA-MEM is good for finding + the best hit, but is not very sensitive to suboptimal hits. Option `-x pbread` + is still available, but hidden on the command line. This may be removed in + future releases. + + * Added a new pre-setting for Oxford Nanopore 2D reads. LAST is still a little + more sensitive on older bacterial data, but bwa-mem is as good on more + recent data and is times faster for mapping against mammalian genomes. + + * Added LAST-like seeding. This improves the accuracy for longer reads. + + * Added option `-H` to insert arbitrary header lines. + + * Smarter option `-p`. Given an interleaved FASTQ stream, old bwa-mem identifies + the 2i-th and (2i+1)-th reads as a read pair. The new version identifies + adjacent reads with the same read name as a read pair. It is possible to mix + single-end and paired-end reads in one FASTQ. + + * Improved parallelization. Old bwa-mem waits for I/O. The new version puts + I/O on a separate thread. It performs mapping while reading FASTQ and + writing SAM. This saves significant wall-clock time when reading from + or writing to a slow Unix pipe. + +With the new release, the recommended way to map Illumina reads to GRCh38 is to +use the bwakit binary package: + + bwa.kit/run-gen-ref hs38DH + bwa.kit/bwa index hs38DH.fa + bwa.kit/run-bwamem -t8 -H -o out-prefix hs38DH.fa read1.fq.gz read2.fq.gz | sh + +Please check bwa.kit/README.md for details and command line options. + +(0.7.11: 23 December 2014, r1034) + + + +Release 0.7.10 (13 July, 2014) +------------------------------ + +Notable changes to BWA-MEM: + + * Fixed a segmentation fault due to an alignment bridging the forward-reverse + boundary. This is a bug. + + * Use the PacBio heuristic to map contigs to the reference genome.
The old + heuristic evaluates the necessity of full extension for each chain. This may + not work in long low-complexity regions. The PacBio heuristic performs + SSE2-SW around each short seed. It works better. Note that the heuristic is + only applied to long query sequences. For Illumina reads, the output is + identical to the previous version. + +(0.7.10: 13 July 2014, r789) + + + +Release 0.7.9 (19 May, 2014) +---------------------------- + +This release brings several major changes to BWA-MEM. Notably, BWA-MEM now +formally supports PacBio read-to-reference alignment and experimentally supports +PacBio read-to-read alignment. BWA-MEM also runs faster at a minor cost of +accuracy. The speedup is more significant when GRCh38 is in use. More +specifically: + + * Support PacBio subread-to-reference alignment. Although older BWA-MEM works + with PacBio data in principle, the resultant alignments are frequently + fragmented. In this release, we fine tuned existing methods and introduced + new heuristics to improve PacBio alignment. These changes are not used by + default. Users need to add option "-x pacbio" to enable the feature. + + * Support PacBio subread-to-subread alignment (EXPERIMENTAL). This feature is + enabled with option "-x pbread". In this mode, the output only gives the + overlapping region between a pair of reads without detailed alignment. + + * Output alternative hits in the XA tag if there are not so many of them. This + is a BWA-backtrack feature. + + * Support mapping to ALT contigs in GRCh38 (EXPERIMENTAL). We provide a script + to postprocess hits in the XA tag to adjust the mapping quality and generate + new primary alignments to all overlapping ALT contigs. We would *NOT* + recommend this feature for production uses. + + * Improved alignments to many short reference sequences. Older BWA-MEM may + generate an alignment bridging two or more adjacent reference sequences. + Such alignments are split at a later step as postprocessing. 
This approach + is complex and does not always work. This release forbids these alignments + from the very beginning. BWA-MEM should not produce an alignment bridging + two or more reference sequences any more. + + * Reduced the maximum seed occurrence from 10000 to 500. Reduced the maximum + rounds of Smith-Waterman mate rescue from 100 to 50. Added a heuristic to + lower the mapping quality if a read contains seeds with excessive + occurrences. These changes make BWA-MEM faster at a minor cost of accuracy + in highly repetitive regions. + + * Added an option "-Y" to use soft clipping for supplementary alignments. + + * Bugfix: incomplete alignment extension in corner cases. + + * Bugfix: integer overflow when aligning long query sequences. + + * Bugfix: chain score is not computed correctly (almost no practical effect) + + * General code cleanup + + * Added FAQs to README + +Changes in BWA-backtrack: + + * Bugfix: a segmentation fault when an alignment stands out of the end of the + last chromosome. + +(0.7.9: 19 May 2014, r783) + + + +Release 0.7.8 (31 March, 2014) +------------------------------ + +Changes in BWA-MEM: + + * Bugfix: off-diagonal X-dropoff (option -d) not working as intended. + Short-read alignment is not affected. + + * Bugfix: unnecessarily large bandwidth used during global alignment, + which reduces the mapping speed by ~5% for short reads. Results are not + affected. + + * Bugfix: when the matching score is not one, paired-end mapping quality is + inaccurate. + + * When the matching score (option -A) is changed, scale all score-related + options accordingly unless overridden by users. + + * Allow to specify different gap open (or extension) penalties for deletions + and insertions separately. + + * Allow to specify the insert size distribution. + + * Better and more detailed debugging information. + +With the default setting, 0.7.8 and 0.7.7 gave identical output on one million +100bp read pairs.
+ +(0.7.8: 31 March 2014, r455) + + + +Release 0.7.7 (25 February, 2014) +--------------------------------- + +This release fixes incorrect MD tags in the BWA-MEM output. + +A note about short-read mapping to GRCh38. The new human reference genome +GRCh38 contains 60Mbp program generated alpha repeat arrays, some of which are +hard masked as they cannot be localized. These highly repetitive arrays make +BWA-MEM ~50% slower. If you are concerned with the performance of BWA-MEM, you +may consider to use option "-c2000 -m50". On simulated data, this setting helps +the performance at a very minor cost on accuracy. I may consider to change the +default in future releases. + +(0.7.7: 25 February 2014, r441) + + + +Release 0.7.6 (31 January, 2014) +--------------------------------- + +Changes in BWA-MEM: + + * Changed the way mapping quality is estimated. The new method tends to give + the same alignment a higher mapping quality. On paired-end reads, the change + is minor as with pairing, the mapping quality is usually high. For short + single-end reads, the difference is considerable. + + * Improved load balance when many threads are spawned. However, bwa-mem is + still not very thread efficient, probably due to the frequent heap memory + allocation. Further improvement is a little difficult and may affect the + code stability. + + * Allow to use different clipping penalties for 5'- and 3'-ends. This helps + when we do not want to clip one end. + + * Print the @PG line, including the command line options. + + * Improved the band width estimate: a) fixed a bug causing the band + width estimated from extension not used in the final global alignment; b) + try doubled band width if the global alignment score is smaller. + Insufficient band width leads to wrong CIGAR and spurious mismatches/indels. + + * Added a new option -D to fine tune a heuristic on dropping suboptimal hits. + Reducing -D increases accuracy but decreases the mapping speed.
If unsure, + leave it to the default. + + * Bugfix: for a repetitive single-end read, the reported hit is not randomly + distributed among equally best hits. + + * Bugfix: missing paired-end hits due to unsorted list of SE hits. + + * Bugfix: incorrect CIGAR caused by a defect in the global alignment. + + * Bugfix: incorrect CIGAR caused by failed SW rescue. + + * Bugfix: alignments largely mapped to the same position are regarded to be + distinct from each other, which leads to underestimated mapping quality. + + * Added the MD tag. + +There are no changes to BWA-backtrack in this release. However, it has a few +known issues yet to be fixed. If you prefer BWA-backtrack, it is still advised to +use bwa-0.6.x. + +While I developed BWA-MEM, I also found a few issues with BWA-SW. It is now +possible to improve BWA-SW with the lessons learned from BWA-MEM. However, as +BWA-MEM is usually better, I will not improve BWA-SW until I find applications +where BWA-SW may excel. + +(0.7.6: 31 January 2014, r432) + + + +Release 0.7.5a (30 May, 2013) +----------------------------- + +Fixed a bug in BWA-backtrack which leads to off-by-one mapping errors in rare +cases. + +(0.7.5a: 30 May 2013, r405) + + + +Release 0.7.5 (29 May, 2013) +---------------------------- + +Changes in all components: + + * Improved error checking on memory allocation and file I/O. Patches provided + by Rob Davies. + + * Updated README. + + * Bugfix: return code is zero upon errors. + +Changes in BWA-MEM: + + * Changed the way a chimeric alignment is reported (conforming to the upcoming + SAM spec v1.5). With 0.7.5, if the read has a chimeric alignment, the paired + or the top hit uses soft clipping and is marked with neither 0x800 nor 0x100 + bits. All the other hits part of the chimeric alignment will use hard + clipping and be marked with 0x800 if option "-M" is not in use, or marked + with 0x100 otherwise.
+ + * Other hits part of a chimeric alignment are now reported in the SA tag, + conforming to the SAM spec v1.5. + + * Better method for resolving an alignment bridging two or more short + reference sequences. The current strategy maps the query to the reference + sequence that covers the middle point of the alignment. For most + applications, this change has no effects. + +Changes in BWA-backtrack: + + * Added a magic number to .sai files. This prevents samse/sampe from reading + corrupted .sai (e.g. a .sai file containing LSF log) or incompatible .sai + generated by a different version of bwa. + + * Bugfix: alignments in the XA:Z: tag were wrong. + + * Keep track of #ins and #del during backtracking. This simplifies the code + and reduces errors in rare corner cases. I should have done this in the + early days of bwa. + +In addition, if you use BWA-MEM or the fastmap command of BWA, please cite: + + - Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs + with BWA-MEM. arXiv:1303.3997v2 [q-bio.GN]. + +Thank you. + +(0.7.5: 29 May 2013, r404) + + + +Release 0.7.4 (23 April, 2013) +------------------------------ + +This is a bugfix release. Most of bugs are considered to be minor which only +occur very rarely. + + * Bugfix: wrong CIGAR when a query sequence bridges three or more target + sequences. This only happens when aligning reads to short assembly contigs. + + * Bugfix: leading "D" operator in CIGAR. + + * Extend more seeds for better alignment around tandem repeats. This is also + a cause of the leading "D" operator in CIGAR. + + * Bugfix: SSE2-SSW may occasionally find incorrect query starting position + around tandem repeat. This will lead to a suboptimal CIGAR in BWA-MEM and + a wrong CIGAR in BWA. + + * Bugfix: clipping penalty does not work as is intended when there is a gap + towards the end of a read. + + * Fixed an issue caused by a bug in the libc from Mac/Darwin. 
In Darwin, + fread() is unable to read a data block longer than 2GB due to an integer + overflow bug in its implementation. + +Since version 0.7.4, BWA-MEM is considered to reach similar stability to +BWA-backtrack for short-read mapping. + +(0.7.4: 23 April, r385) + + + +Release 0.7.3a (15 March, 2013) +------------------------------- + +In 0.7.3, the wrong CIGAR bug was only fixed in one scenario, but not fixed +in another corner case. + +(0.7.3a: 15 March 2013, r367) + + + +Release 0.7.3 (15 March, 2013) +------------------------------ + +Changes to BWA-MEM: + + * Bugfix: pairing score is inaccurate when option -A does not take the default + value. This is a very minor issue even if it happens. + + * Bugfix: occasionally wrong CIGAR. This happens when in the alignment there + is a 1bp deletion and a 1bp insertion which are close to the end of the + reads, and there are no other substitutions or indels. BWA-MEM would not do + a gapped alignment due to the bug. + + * New feature: output other non-overlapping alignments in the XP tag such that + we can see the entire picture of alignment from one SAM line. XP gives the + position, CIGAR, NM and mapQ of each aligned subsequence of the query. + +BWA-MEM has been used to align -300Gbp 100-700bp SE/PE reads. SNP/indel calling +has also been evaluated on part of these data. BWA-MEM generally gives better +pre-filtered SNP calls than BWA. No significant issues have been observed since +0.7.2, though minor improvements or bugs (e.g. the bug fixed in this release) +are still possible. If you find potential issues, please send bug reports to + (free registration required). + +In addition, more detailed description of the BWA-MEM algorithm can be found at +. + +(0.7.3: 15 March 2013, r366) + + + +Release 0.7.2 (9 March, 2013) +----------------------------- + +Emergent bug fix: 0.7.0 and 0.7.1 give a wrong sign to TLEN. In addition, +flagging 'properly paired' also gets improved a little. 
+ +(0.7.2: 9 March 2013, r351) + + + +Release 0.7.1 (8 March, 2013) +----------------------------- + +Changes to BWA-MEM: + + * Bugfix: rare segmentation fault caused by a partial hit to the end of the + last sequence. + + * Bugfix: occasional mis-pairing given an interleaved fastq. + + * Bugfix: wrong mate information when the mate is unmapped. SAM generated by + BWA-MEM can now be validated with Picard. + + * Improved the performance and accuracy for ultra-long query sequences. + Short-read alignment is not affected. + +Changes to other components: + + * In BWA-backtrack and BWA-SW, replaced the code for global alignment, + Smith-Waterman and SW extension. The performance and accuracy of the two + algorithms stay the same. + + * Added an experimental subcommand to merge overlapping paired ends. The + algorithm is very conservative: it may miss true overlaps but rarely makes + mistakes. + +An important note is that like BWA-SW, BWA-MEM may output multiple primary +alignments for a read, which may cause problems to some tools. For aligning +sequence reads, it is advised to use '-M' to flag extra hits as secondary. This +option is not the default because multiple primary alignments are theoretically +possible in sequence alignment. + +(0.7.1: 8 March 2013, r347) + + + +Beta Release 0.7.0 (28 Feburary, 2013) +-------------------------------------- + +This release comes with a new alignment algorithm, BWA-MEM, for 70bp-1Mbp query +sequences. BWA-MEM essentially seeds alignments with a variant of the fastmap +algorithm and extends seeds with banded affine-gap-penalty dynamic programming +(i.e. the Smith-Waterman-Gotoh algorithm). For typical Illumina 100bp reads or +longer low-divergence query sequences, BWA-MEM is about twice as fast as BWA +and BWA-SW and is more accurate. It also supports split alignments like BWA-SW +and may optionally output multiple hits like BWA. 
BWA-MEM does not guarantee +to find hits within a certain edit distance, but BWA is not efficient for such +task given longer reads anyway, and the edit-distance criterion is arguably +not as important in long-read alignment. + +In addition to the algorithmic improvements, BWA-MEM also implements a few +handy features in practical aspects: + + 1. BWA-MEM automatically switches between local and glocal (global wrt reads; + local wrt reference) alignment. It reports the end-to-end glocal alignment + if the glocal alignment is not much worse than the optimal local alignment. + Glocal alignment reduces reference bias. + + 2. BWA-MEM automatically infers pair orientation from a batch of single-end + alignments. It allows more than one orientations if there are sufficient + supporting reads. This feature has not been tested on reads from Illumina + jumping library yet. (EXPERIMENTAL) + + 3. BWA-MEM optionally takes one interleaved fastq for paired-end mapping. It + is possible to convert a name-sorted BAM to an interleaved fastq on the fly + and feed the data stream to BWA-MEM for mapping. + + 4. BWA-MEM optionally copies FASTA/Q comments to the final SAM output, which + helps to transfer individual read annotations to the output. + + 5. BWA-MEM supports more advanced piping. Users can now run: + (bwa mem ref.fa '20) CPU cores. + + * Check I/O error. + + * Increased the maximum barcode length to 63bp. + + * Automatically choose the indexing algorithm. + + * Bugfix: very rare segfault due to an uninitialized variable. The bug also + affects the placement of suboptimal alignments. The effect is very minor. + +This release involves quite a lot of tricky changes. Although it has been +tested on a few data sets, subtle bugs may be still hidden. It is *NOT* +recommended to use this release in a production pipeline. In future, however, +BWA-SW may be better when reads continue to go longer. I would encourage users +to try the 0.6 release. 
I would also like to hear the users' experience. Thank +you. + +(0.6.0: 12 November 2011, r85) + + + +Beta Release 0.5.9 (24 January, 2011) +------------------------------------- + +Notable changes: + + * Feature: barcode support via the '-B' option. + + * Feature: Illumina 1.3+ read format support via the '-I' option. + + * Bugfix: RG tags are not attached to unmapped reads. + + * Bugfix: very rare bwasw mismappings + + * Recommend options for PacBio reads in bwasw help message. + + +Also, since January 13, the BWA master repository has been moved to github: + + https://github.com/lh3/bwa + +The revision number has been reset. All recent changes will be first +committed to this repository. + +(0.5.9: 24 January 2011, r16) + + + +Beta Release Candidate 0.5.9rc1 (10 December, 2010) +--------------------------------------------------- + +Notable changes in bwasw: + + * Output unmapped reads. + + * For a repetitive read, choose a random hit instead of a fixed + one. This is not well tested. + +Notable changes in bwa-short: + + * Fixed a bug in the SW scoring system, which may lead to unexpected + gaps towards the end of a read. + + * Fixed a bug which invalidates the randomness of repetitive reads. + + * Fixed a rare memory leak. + + * Allowed to specify the read group at the command line. + + * Take name-grouped BAM files as input. + +Changes to this release are usually safe in that they do not interfere +with the key functionality. However, the release has only been tested on +small samples instead of on large-scale real data. If anything weird +happens, please report the bugs to the bio-bwa-help mailing list. + +(0.5.9rc1: 10 December 2010, r1561) + + + +Beta Release 0.5.8 (8 June, 2010) +--------------------------------- + +Notable changes in bwasw: + + * Fixed an issue of missing alignments. This should happen rarely and + only when the contig/read alignment is multi-part. Very rarely, bwasw + may still miss a segment in a multi-part alignment. 
This is difficult + to fix, although possible. + +Notable changes in bwa-short: + + * Discard the SW alignment when the best single-end alignment is much + better. Such a SW alignment may be caused by structural variations and + forcing it to be aligned leads to false alignment. This fix has not + been tested thoroughly. It would be great to receive more user + feedback on this issue. + + * Fixed a typo/bug in sampe which leads to unnecessarily large memory + usage in some cases. + + * Further reduced the chance of reporting 'weird pairing'. + +(0.5.8: 8 June 2010, r1442) + + + +Beta Release 0.5.7 (1 March, 2010) +---------------------------------- + +This release only has an effect on paired-end data with fat insert-size +distribution. Users are still recommended to update as the new release +improves the robustness to poor data. + + * The fix for 'weird pairing' was not working in version 0.5.6, pointed + out by Carol Scott. It should work now. + + * Optionally output to a normal file rather than to stdout (by Tim + Fennel). + +(0.5.7: 1 March 2010, r1310) + + + +Beta Release 0.5.6 (10 February, 2010) +-------------------------------------- + +Notable changes in bwa-short: + + * Report multiple hits in the SAM format at a new tag XA encoded as: + (chr,pos,CIGAR,NM;)*. By default, if a paired or single-end read has + 4 or fewer hits, they will all be reported; if a read in an anomalous + pair has 11 or fewer hits, all of them will be reported. + + * Perform Smith-Waterman alignment also for anomalous read pairs when + both ends have quality higher than 17. This reduces false positives + for some SV discovery algorithms. + + * Do not report "weird pairing" when the insert size distribution is + too fat or has a mean close to zero. + + * If a read is bridging two adjacent chromosomes, flag it as unmapped. + + * Fixed a small but long existing memory leak in paired-end mapping.
+ + * Multiple bug fixes in SOLiD mapping: a) quality "-1" can be correctly + parsed by solid2fastq.pl; b) truncated quality string is resolved; c) + SOLiD read mapped to the reverse strand is complemented. + + * Bwa now calculates skewness and kurtosis of the insert size + distribution. + + * Deploy a Bayesian method to estimate the maximum distance for a read + pair considered to be paired properly. The method is proposed by + Gerton Lunter, but bwa only implements a simplified version. + + * Export more functions for Java bindings, by Matt Hanna (See: + http://www.broadinstitute.org/gsa/wiki/index.php/Sting_BWA/C_bindings) + + * Abstract bwa CIGAR for further extension, by Rodrigo Goya. + +(0.5.6: 10 Feburary 2010, r1303) + + + +Beta Release 0.5.5 (10 November, 2009) +-------------------------------------- + +This is a bug fix release: + + * Fixed a serious bug/typo in aln which does not occur given short + reads, but will lead to segfault for >500bp reads. Of course, the aln + command is not recommended for reads longer than 200bp, but this is a + bug anyway. + + * Fixed a minor bug/typo which leads to incorrect single-end mapping + quality when one end is moved to meet the mate-pair requirement. + + * Fixed a bug in samse for mapping in the color space. This bug is + caused by quality filtration added since 0.5.1. + +(0.5.5: 10 November 2009, r1273) + + + +Beta Release 0.5.4 (9 October, 2009) +------------------------------------ + +Since this version, the default seed length used in the "aln" command is +changed to 32. + +Notable changes in bwa-short: + + * Added a new tag "XC:i" which gives the length of clipped reads. + + * In sampe, skip alignments in case of a bug in the Smith-Waterman + alignment module. + + * In sampe, fixed a bug in pairing when the read sequence is identical + to its reverse complement. + + * In sampe, optionally preload the entire FM-index into memory to + reduce disk operations. 
+ +Notable changes in dBWT-SW/BWA-SW: + + * Changed name dBWT-SW to BWA-SW. + + * Optionally use "hard clipping" in the SAM output. + +(0.5.4: 9 October 2009, r1245) + + + +Beta Release 0.5.3 (15 September, 2009) +--------------------------------------- + +Fixed a critical bug in bwa-short: reads mapped to the reverse strand +are not complemented. + +(0.5.3: 15 September 2009, r1225) + + + +Beta Release 0.5.2 (13 September, 2009) +--------------------------------------- + +Notable changes in bwa-short: + + * Optionally trim reads before alignment. See the manual page on 'aln + -q' for detailed description. + + * Fixed a bug in calculating the NM tag for a gapped alignment. + + * Fixed a bug given a mixture of reads with some longer than the seed + length and some shorter. + + * Print SAM header. + +Notable changes in dBWT-SW: + + * Changed the default value of -T to 30. As a result, the accuracy is a + little higher for short reads at the cost of speed. + +(0.5.2: 13 September 2009, r1223) + + + +Beta Release 0.5.1 (2 September, 2009) +-------------------------------------- + +Notable changes in the short read alignment component: + + * Fixed a bug in samse: do not write mate coordinates. + +Notable changes in dBWT-SW: + + * Randomly choose one alignment if the read is a repetitive. + + * Fixed a flaw when a read is mapped across two adjacent reference + sequences. However, wrong alignment reports may still occur rarely in + this case. + + * Changed the default band width to 50. The speed is slower due to this + change. + + * Improved the mapping quality a little given long query sequences. + +(0.5.1: 2 September 2009, r1209) + + + +Beta Release 0.5.0 (20 August, 2009) +------------------------------------ + +This release implements a novel algorithm, dBWT-SW, specifically +designed for long reads. It is 10-50 times faster than SSAHA2, depending +on the characteristics of the input data, and achieves comparable +alignment accuracy while allowing chimera detection. 
In comparison to +BLAT, dBWT-SW is several times faster and much more accurate especially +when the error rate is high. Please read the manual page for more +information. + +The dBWT-SW algorithm is kind of developed for future sequencing +technologies which produce much longer reads with a little higher error +rate. It is still at its early development stage. Some features are +missing and it may be buggy although I have evaluated on several +simulated and real data sets. But following the "release early" +paradigm, I would like the users to try it first. + +Other notable changes in BWA are: + + * Fixed a rare bug in the Smith-Waterman alignment module. + + * Fixed a rare bug about the wrong alignment coordinate when a read is + poorly aligned. + + * Fixed a bug in generating the "mate-unmap" SAM tag when both ends in + a pair are unmapped. + +(0.5.0: 20 August 2009, r1200) + + + +Beta Release 0.4.9 (19 May, 2009) +--------------------------------- + +Interestingly, the integer overflow bug claimed to be fixed in 0.4.7 has +not in fact. Now I have fixed the bug. Sorry for this and thank Quan +Long for pointing out the bug (again). + +(0.4.9: 19 May 2009, r1075) + + + +Beta Release 0.4.8 (18 May, 2009) +--------------------------------- + +One change to "aln -R". Now by default, if there are no more than '-R' +equally best hits, bwa will search for suboptimal hits. This change +affects the ability in finding SNPs in segmental duplications. + +I have not tested this option thoroughly, but this simple change is less +likely to cause new bugs. Hope I am right. + +(0.4.8: 18 May 2009, r1073) + + + +Beta Release 0.4.7 (12 May, 2009) +--------------------------------- + +Notable changes: + + * Output SM (single-end mapping quality) and AM (smaller mapping + quality among the two ends) tag from sam output. + + * Improved the functionality of stdsw. + + * Made the XN tag more accurate. + + * Fixed a very rare segfault caused by integer overflow. 
+ + * Improve the insert size estimation. + + * Fixed compiling errors for some Linux systems. + +(0.4.7: 12 May 2009, r1066) + + + +Beta Release 0.4.6 (9 March, 2009) +---------------------------------- + +This release improves the SOLiD support. First, a script for converting +SOLiD raw data is provided. This script is adapted from solid2fastq.pl +in the MAQ package. Second, a nucleotide reference file can be directly +used with 'bwa index'. Third, SOLiD paired-end support is +completed. Fourth, color-space reads will be converted to nucleotides +when SAM output is generated. Color errors are corrected in this +process. Please note that like MAQ, BWA cannot make use of the primer +base and the first color. + +In addition, the calculation of mapping quality is also improved a +little bit, although end-users may barely observe the difference. + +(0.4.6: 9 March 2009, r915) + + + +Beta Release 0.4.5 (18 February, 2009) +-------------------------------------- + +Not much happened, but I think it would be good to let the users use the +latest version. + +Notable changes (Thank Bob Handsaker for catching the two bugs): + + * Improved boundary check. Previous version may still give incorrect + alignment coordinates in rare cases. + + * Fixed a bug in SW alignment when no residue matches. This only + affects the 'sampe' command. + + * Robustly estimate insert size without setting the maximum on the + command line. Since this release 'sampe -a' only has an effect if + there are not enough good pairs to infer the insert size + distribution. + + * Reduced false PE alignments a little bit by using the inferred insert + size distribution. This fix may be more important for long insert + size libraries. + +(0.4.5: 18 February 2009, r829) + + + +Beta Release 0.4.4 (15 February, 2009) +-------------------------------------- + +This is mainly a bug fix release. Notable changes are: + + * Imposed boundary check for extracting subsequence from the + genome.
Previously this causes memory problem in rare cases. + + * Fixed a bug in failing to find whether an alignment overlapping with + N on the genome. + + * Changed MD tag to meet the latest SAM specification. + +(0.4.4: 15 Feburary 2009, r815) + + + +Beta Release 0.4.3 (22 January, 2009) +------------------------------------ + +Notable changes: + + * Treat an ambiguous base N as a mismatch. Previous versions will not + map reads containing any N. + + * Automatically choose the maximum allowed number of differences. This + is important when reads of different lengths are mixed together. + + * Print mate coordinate if only one end is unmapped. + + * Generate MD tag. This tag encodes the mismatching positions and the + reference bases at these positions. Deletions from the reference will + also be printed. + + * Optionally dump multiple hits from samse, in another concise format + rather than SAM. + + * Optionally disable iterative search. This is VERY SLOOOOW, though. + + * Fixed a bug in generate SAM. + +(0.4.3: 22 January 2009, r787) + + + +Beta Release 0.4.2 (9 January, 2009) +------------------------------------ + +Aaron Quinlan found a bug in the indexer: the bwa indexer segfaults if +there are no comment texts in the FASTA header. This is a critical +bug. Nothing else was changed. + +(0.4.2: 9 January 2009, r769) + + + +Beta Release 0.4.1 (7 January, 2009) +------------------------------------ + +I am sorry for the quick updates these days. I like to set a milestone +for BWA and this release seems to be. For paired end reads, BWA also +does Smith-Waterman alignment for an unmapped read whose mate can be +mapped confidently. With this strategy BWA achieves similar accuracy to +maq. Benchmark is also updated accordingly. + +(0.4.1: 7 January 2009, r760) + + + +Beta Release 0.4.0 (6 January, 2009) +------------------------------------ + +In comparison to the release two days ago, this release is mainly tuned +for performance with some tricks I learnt from Bowtie. 
However, as the +indexing format has also been changed, I have to increase the version +number to 0.4.0 to emphasize that *DATABASE MUST BE RE-INDEXED* with +'bwa index'. + + * Improved the speed by about 20%. + + * Added multi-threading to 'bwa aln'. + +(0.4.0: 6 January 2009, r756) + + + +Beta Release 0.3.0 (4 January, 2009) +------------------------------------ + + * Added paired-end support by separating SA calculation and alignment + output. + + * Added SAM output. + + * Added evaluation to the documentation. + +(0.3.0: 4 January 2009, r741) + + + +Beta Release 0.2.0 (15 August, 2008) +------------------------------------- + + * Take the subsequence at the 5'-end as seed. Seeding strategy greatly + improves the speed for long reads, at the cost of missing a few true + hits that contain many differences in the seed. Seeding also increases + the memory by 800MB. + + * Fixed a bug which may miss some gapped alignments. Fixing the bug + also slows the speed a little. + +(0.2.0: 15 August 2008, r428) + + + +Beta Release 0.1.6 (08 August, 2008) +------------------------------------- + + * Give accurate CIGAR string. + + * Add a simple interface to SW/NW alignment. + +(0.1.6: 08 August 2008, r414) + + + +Beta Release 0.1.5 (27 July, 2008) +---------------------------------- + + * Improve the speed. This version is expected to give the same results. + +(0.1.5: 27 July 2008, r400) + + + +Beta Release 0.1.4 (22 July, 2008) +---------------------------------- + + * Fixed a bug which may cause missing gapped alignments. + + * More clearly define what alignments can be found by BWA (See + manual). Now BWA runs a little slower because it will visit more + potential gapped alignments. + + * A bit of code clean-up. + +(0.1.4: 22 July 2008, r387) + + + +Beta Release 0.1.3 (21 July, 2008) +---------------------------------- + +Improve the speed with some tricks on retrieving occurrences. The results +should be exactly the same as those of 0.1.2.
+ +(0.1.3: 21 July 2008, r382) + + + +Beta Release 0.1.2 (17 July, 2008) +---------------------------------- + +Support gapped alignment. Codes for ungapped alignment has been removed. + +(0.1.2: 17 July 2008, r371) + + + +Beta Release 0.1.1 (03 June, 2008) +----------------------------------- + +This is the first release of BWA, Burrows-Wheeler Alignment tool. Please +read man page for more information about this software. + +(0.1.1: 03 June 2008, r349) diff --git a/src/bwa/QSufSort.c b/src/bwa/QSufSort.c new file mode 100644 index 000000000..36c5a519a --- /dev/null +++ b/src/bwa/QSufSort.c @@ -0,0 +1,402 @@ +/* QSufSort.c + + Original source from qsufsort.c + + Copyright 1999, N. Jesper Larsson, all rights reserved. + + This file contains an implementation of the algorithm presented in "Faster + Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko + Sadakane (sada@is.s.u-tokyo.ac.jp). + + This software may be used freely for any purpose. However, when distributed, + the original source must be clearly stated, and, when the source code is + distributed, the copyright notice must be retained and any alterations in + the code must be clearly marked. No warranty is given regarding the quality + of this software. + + Modified by Wong Chi-Kwong, 2004 + + Changes summary: - Used long variable and function names + - Removed global variables + - Replace pointer references with array references + - Used insertion sort in place of selection sort and increased insertion sort threshold + - Reconstructing suffix array from inverse becomes an option + - Add handling where end-of-text symbol is not necessary < all characters + - Removed codes for supporting alphabet size > number of characters + + No warrenty is given regarding the quality of the modifications. + +*/ + + +#include +#include +#include +#include "QSufSort.h" + +#define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) ) +#define med3(a, b, c) ( ac ? b : a>c ? 
c : a)) +#define swap(a, b, t); t = a; a = b; b = t; + +// Static functions +static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar); +static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar); +static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar); +static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize); +static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, + const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated); + +/* Makes suffix array p of x. x becomes inverse of p. p and x are both of size + n+1. Contents of x[0...n-1] are integers in the range l...k-1. 
Original + contents of x[n] is disregarded, the n-th symbol being regarded as + end-of-string smaller than all other symbols.*/ +void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, + const qsint_t smallestInputSymbol, const int skipTransform) +{ + qsint_t i, j; + qsint_t s, negatedSortedGroupLength; + qsint_t numSymbolAggregated; + qsint_t numSortedPos = 1; + qsint_t newAlphabetSize; + + if (!skipTransform) { + /* bucketing possible*/ + newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol, + numChar, &numSymbolAggregated); + QSufSortBucketSort(V, I, numChar, newAlphabetSize); + I[0] = -1; + V[numChar] = 0; + numSortedPos = numSymbolAggregated; + } + + while ((qsint_t)(I[0]) >= -(qsint_t)numChar) { + i = 0; + negatedSortedGroupLength = 0; + do { + s = I[i]; + if (s < 0) { + i -= s; /* skip over sorted group.*/ + negatedSortedGroupLength += s; + } else { + if (negatedSortedGroupLength) { + I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine preceding sorted groups */ + negatedSortedGroupLength = 0; + } + j = V[s] + 1; + QSufSortSortSplit(V, I, i, j - 1, numSortedPos); + i = j; + } + } while (i <= numChar); + if (negatedSortedGroupLength) { + /* array ends with a sorted group.*/ + I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine sorted groups at end of I.*/ + } + numSortedPos *= 2; /* double sorted-depth.*/ + } +} + +void QSufSortGenerateSaFromInverse(const qsint_t* V, qsint_t* __restrict I, const qsint_t numChar) +{ + qsint_t i; + for (i=0; i<=numChar; i++) + I[V[i]] = i + 1; +} + +/* Sorting routine called for each unsorted group. Sorts the array of integers + (suffix numbers) of length n starting at p. The algorithm is a ternary-split + quicksort taken from Bentley & McIlroy, "Engineering a Sort Function", + Software -- Practice and Experience 23(11), 1249-1265 (November 1993). 
This + function is based on Program 7.*/ +static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar) { + + qsint_t a, b, c, d; + qsint_t l, m; + qsint_t f, v, s, t; + qsint_t tmp; + qsint_t numItem; + + numItem = highestPos - lowestPos + 1; + + if (numItem <= INSERT_SORT_NUM_ITEM) { + QSufSortInsertSortSplit(V, I, lowestPos, highestPos, numSortedChar); + return; + } + + v = QSufSortChoosePivot(V, I, lowestPos, highestPos, numSortedChar); + + a = b = lowestPos; + c = d = highestPos; + + while (1) { + while (c >= b && (f = KEY(V, I, b, numSortedChar)) <= v) { + if (f == v) { + swap(I[a], I[b], tmp); + a++; + } + b++; + } + while (c >= b && (f = KEY(V, I, c, numSortedChar)) >= v) { + if (f == v) { + swap(I[c], I[d], tmp); + d--; + } + c--; + } + if (b > c) + break; + swap(I[b], I[c], tmp); + b++; + c--; + } + + s = a - lowestPos; + t = b - a; + s = min(s, t); + for (l = lowestPos, m = b - s; m < b; l++, m++) { + swap(I[l], I[m], tmp); + } + + s = d - c; + t = highestPos - d; + s = min(s, t); + for (l = b, m = highestPos - s + 1; m <= highestPos; l++, m++) { + swap(I[l], I[m], tmp); + } + + s = b - a; + t = d - c; + if (s > 0) + QSufSortSortSplit(V, I, lowestPos, lowestPos + s - 1, numSortedChar); + + // Update group number for equal portion + a = lowestPos + s; + b = highestPos - t; + if (a == b) { + // Sorted group + V[I[a]] = a; + I[a] = -1; + } else { + // Unsorted group + for (c=a; c<=b; c++) + V[I[c]] = b; + } + + if (t > 0) + QSufSortSortSplit(V, I, highestPos - t + 1, highestPos, numSortedChar); + +} + +/* Algorithm by Bentley & McIlroy.*/ +static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar) { + + qsint_t m; + qsint_t keyl, keym, keyn; + qsint_t key1, key2, key3; + qsint_t s; + qsint_t numItem; + + numItem = highestPos - lowestPos + 1; + + m = lowestPos + 
numItem / 2; + + s = numItem / 8; + key1 = KEY(V, I, lowestPos, numSortedChar); + key2 = KEY(V, I, lowestPos+s, numSortedChar); + key3 = KEY(V, I, lowestPos+2*s, numSortedChar); + keyl = med3(key1, key2, key3); + key1 = KEY(V, I, m-s, numSortedChar); + key2 = KEY(V, I, m, numSortedChar); + key3 = KEY(V, I, m+s, numSortedChar); + keym = med3(key1, key2, key3); + key1 = KEY(V, I, highestPos-2*s, numSortedChar); + key2 = KEY(V, I, highestPos-s, numSortedChar); + key3 = KEY(V, I, highestPos, numSortedChar); + keyn = med3(key1, key2, key3); + + return med3(keyl, keym, keyn); + + +} + +/* Quadratic sorting method to use for small subarrays. */ +static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar) +{ + qsint_t i, j; + qsint_t tmpKey, tmpPos; + qsint_t numItem; + qsint_t key[INSERT_SORT_NUM_ITEM], pos[INSERT_SORT_NUM_ITEM]; + qsint_t negativeSortedLength; + qsint_t groupNum; + + numItem = highestPos - lowestPos + 1; + + for (i=0; i0 && key[j-1] > tmpKey; j--) { + key[j] = key[j-1]; + pos[j] = pos[j-1]; + } + key[j] = tmpKey; + pos[j] = tmpPos; + } + + negativeSortedLength = -1; + + i = numItem - 1; + groupNum = highestPos; + while (i > 0) { + I[i+lowestPos] = pos[i]; + V[I[i+lowestPos]] = groupNum; + if (key[i-1] == key[i]) { + negativeSortedLength = 0; + } else { + if (negativeSortedLength < 0) + I[i+lowestPos] = negativeSortedLength; + groupNum = i + lowestPos - 1; + negativeSortedLength--; + } + i--; + } + + I[lowestPos] = pos[0]; + V[I[lowestPos]] = groupNum; + if (negativeSortedLength < 0) + I[lowestPos] = negativeSortedLength; +} + +/* Bucketsort for first iteration. + + Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear + at least once. x[n] is 0. (This is the corresponding output of transform.) k + must be at most n+1. p is array of size n+1 whose contents are disregarded. 
+ + Output: x is V and p is I after the initial sorting stage of the refined + suffix sorting algorithm.*/ + +static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize) +{ + qsint_t i, c; + qsint_t d; + qsint_t groupNum; + qsint_t currentIndex; + + // mark linked list empty + for (i=0; i0; i--) { + c = I[i-1]; + d = (qsint_t)(V[c]); + groupNum = currentIndex; + V[c] = groupNum; + if (d >= 0) { + I[currentIndex] = c; + while (d >= 0) { + c = d; + d = V[c]; + V[c] = groupNum; + currentIndex--; + I[currentIndex] = c; + } + } else { + // sorted group + I[currentIndex] = -1; + } + currentIndex--; + } +} + +/* Transforms the alphabet of x by attempting to aggregate several symbols into + one, while preserving the suffix order of x. The alphabet may also be + compacted, so that x on output comprises all integers of the new alphabet + with no skipped numbers. + + Input: x is an array of size n+1 whose first n elements are positive + integers in the range l...k-1. p is array of size n+1, used for temporary + storage. q controls aggregation and compaction by defining the maximum intue + for any symbol during transformation: q must be at least k-l; if q<=n, + compaction is guaranteed; if k-l>n, compaction is never done; if q is + INT_MAX, the maximum number of symbols are aggregated into one. + + Output: Returns an integer j in the range 1...q representing the size of the + new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is + set to the number of old symbols grouped into one. 
Only x[n] is 0.*/ +static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, + const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated) +{ + qsint_t c, i, j; + qsint_t a; // numSymbolAggregated + qsint_t mask; + qsint_t minSymbolInChunk = 0, maxSymbolInChunk = 0; + qsint_t newAlphabetSize; + qsint_t maxNumInputSymbol, maxNumBit, maxSymbol; + + maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1; + + for (maxNumBit = 0, i = maxNumInputSymbol; i; i >>= 1) ++maxNumBit; + maxSymbol = QSINT_MAX >> maxNumBit; + + c = maxNumInputSymbol; + for (a = 0; a < numChar && maxSymbolInChunk <= maxSymbol && c <= maxNewAlphabetSize; a++) { + minSymbolInChunk = (minSymbolInChunk << maxNumBit) | (V[a] - smallestInputSymbol + 1); + maxSymbolInChunk = c; + c = (maxSymbolInChunk << maxNumBit) | maxNumInputSymbol; + } + + mask = (1 << (a-1) * maxNumBit) - 1; /* mask masks off top old symbol from chunk.*/ + V[numChar] = smallestInputSymbol - 1; /* emulate zero terminator.*/ + + /* bucketing possible, compact alphabet.*/ + for (i=0; i<=maxSymbolInChunk; i++) + I[i] = 0; /* zero transformation table.*/ + c = minSymbolInChunk; + for (i=a; i<=numChar; i++) { + I[c] = 1; /* mark used chunk symbol.*/ + c = ((c & mask) << maxNumBit) | (V[i] - smallestInputSymbol + 1); /* shift in next old symbol in chunk.*/ + } + for (i=1; i number of characters + + No warrenty is given regarding the quality of the modifications. 
+ +*/ + +#ifndef __QSUFSORT_H__ +#define __QSUFSORT_H__ + +#include + +#define KEY(V, I, p, h) ( V[ I[p] + h ] ) +#define INSERT_SORT_NUM_ITEM 16 + +typedef int64_t qsint_t; +#define QSINT_MAX INT64_MAX + +void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, + const qsint_t smallestInputSymbol, const int skipTransform); +void QSufSortGenerateSaFromInverse(const qsint_t *V, qsint_t* __restrict I, const qsint_t numChar); + + +#endif diff --git a/src/bwa/README-alt.md b/src/bwa/README-alt.md new file mode 100644 index 000000000..058ab7aa9 --- /dev/null +++ b/src/bwa/README-alt.md @@ -0,0 +1,178 @@ +## For the Impatient + +```sh +# Download bwakit (or from manually) +wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.12_x64-linux.tar.bz2/download \ + | gzip -dc | tar xf - +# Generate the GRCh38+ALT+decoy+HLA and create the BWA index +bwa.kit/run-gen-ref hs38DH # download GRCh38 and write hs38DH.fa +bwa.kit/bwa index hs38DH.fa # create BWA index +# mapping +bwa.kit/run-bwamem -o out -H hs38DH.fa read1.fq read2.fq | sh # skip "|sh" to show command lines +``` + +This generates `out.aln.bam` as the final alignment, `out.hla.top` for best HLA +genotypes on each gene and `out.hla.all` for other possible HLA genotypes. +Please check out [bwa/bwakit/README.md][kithelp] for details. + +## Background + +GRCh38 consists of several components: chromosomal assembly, unlocalized contigs +(chromosome known but location unknown), unplaced contigs (chromosome unknown) +and ALT contigs (long clustered variations). The combination of the first three +components is called the *primary assembly*. It is recommended to use the +complete primary assembly for all analyses. Using ALT contigs in read mapping is +tricky. + +GRCh38 ALT contigs are totaled 109Mb in length, spanning 60Mbp of the primary +assembly. 
However, sequences that are highly diverged from the primary assembly +only contribute a few million bp. Most subsequences of ALT contigs are nearly +identical to the primary assembly. If we align sequence reads to GRCh38+ALT +blindly, we will get many additional reads with zero mapping quality and miss +variants on them. It is crucial to make mappers aware of ALTs. + +BWA-MEM is ALT-aware. It essentially computes mapping quality across the +non-redundant content of the primary assembly plus the ALT contigs and is free +of the problem above. + +## Methods + +### Sequence alignment + +As of now, ALT mapping is done in two separate steps: BWA-MEM mapping and +postprocessing. The `bwa.kit/run-bwamem` script performs the two steps when ALT +contigs are present. The following picture shows an example about how BWA-MEM +infers mapping quality and reports alignment after step 2: + +![](http://lh3lh3.users.sourceforge.net/images/alt-demo.png) + +#### Step 1: BWA-MEM mapping + +At this step, BWA-MEM reads the ALT contig names from "*idxbase*.alt", ignoring +the ALT-to-ref alignment, and labels a potential hit as *ALT* or *non-ALT*, +depending on whether the hit lands on an ALT contig or not. BWA-MEM then reports +alignments and assigns mapQ following these two rules: + +1. The mapQ of a non-ALT hit is computed across non-ALT hits only. The mapQ of + an ALT hit is computed across all hits. + +2. If there are no non-ALT hits, the best ALT hit is outputted as the primary + alignment. If there are both ALT and non-ALT hits, non-ALT hits will be + primary and ALT hits be supplementary (SAM flag 0x800). + +In theory, non-ALT alignments from step 1 should be identical to alignments +against the reference genome with ALT contigs. In practice, the two types of +alignments may differ in rare cases due to seeding heuristics. When an ALT hit +is significantly better than non-ALT hits, BWA-MEM may miss seeds on the +non-ALT hits. 
+ +If we don't care about ALT hits, we may skip postprocessing (step 2). +Nonetheless, postprocessing is recommended as it improves mapQ and gives more +information about ALT hits. + +#### Step 2: Postprocessing + +Postprocessing is done with a separate script `bwa-postalt.js`. It reads all +potential hits reported in the XA tag, lifts ALT hits to the chromosomal +positions using the ALT-to-ref alignment, groups them based on overlaps between +their lifted positions, and then re-estimates mapQ across the best scoring hit +in each group. Being aware of the ALT-to-ref alignment, this script can greatly +improve mapQ of ALT hits and occasionally improve mapQ of non-ALT hits. It also +writes each hit overlapping the reported hit into a separate SAM line. This +enables variant calling on each ALT contig independent of others. + +### On the completeness of GRCh38+ALT + +While GRCh38 is much more complete than GRCh37, it is still missing some true +human sequences. To make sure every piece of sequence in the reference assembly +is correct, the [Genome Reference Consortium][grc] (GRC) require each ALT contig +to have enough support from multiple sources before considering to add it to the +reference assembly. This careful and sophisticated procedure has left out some +sequences, one of which is [this example][novel], a 10kb contig assembled from +CHM1 short reads and present also in NA12878. You can try [BLAT][blat] or +[BLAST][blast] to see where it maps. + +For a more complete reference genome, we compiled a new set of decoy sequences +from GenBank clones and the de novo assembly of 254 public [SGDP][sgdp] samples. +The sequences are included in `hs38DH-extra.fa` from the [BWA binary +package][res]. + +In addition to decoy, we also put multiple alleles of HLA genes in +`hs38DH-extra.fa`. These genomic sequences were acquired from [IMGT/HLA][hladb], +version 3.18.0 and are used to collect reads sequenced from these genes. 
+ +### HLA typing + +HLA genes are known to be associated with many autoimmune diseases, infectious +diseases and drug responses. They are among the most important genes but are +rarely studied by WGS projects due to the high sequence divergence between +HLA genes and the reference genome in these regions. + +By including the HLA gene regions in the reference assembly as ALT contigs, we +are able to effectively identify reads coming from these genes. We also provide +a pipeline, which is included in the [BWA binary package][res], to type the +several classic HLA genes. The pipeline is conceptually simple. It de novo +assembles sequence reads mapped to each gene, aligns exon sequences of each +allele to the assembled contigs and then finds the pairs of alleles that best +explain the contigs. In practice, however, the completeness of IMGT/HLA and +copy-number changes related to these genes are not so straightforward to +resolve. HLA typing may not always be successful. Users may also consider to use +other programs for typing such as [Warren et al (2012)][hla4], [Liu et al +(2013)][hla2], [Bai et al (2014)][hla3] and [Dilthey et al (2014)][hla1], though +most of them are distributed under restrictive licenses. + +## Preliminary Evaluation + +To check whether GRCh38 is better than GRCh37, we mapped the CHM1 and NA12878 +unitigs to GRCh37 primary (hs37), GRCh38 primary (hs38) and GRCh38+ALT+decoy +(hs38DH), and called small variants from the alignment. CHM1 is haploid. +Ideally, heterozygous calls are false positives (FP). NA12878 is diploid. The +true positive (TP) heterozygous calls from NA12878 are approximately equal +to the difference between NA12878 and CHM1 heterozygous calls. A better assembly +should yield higher TP and lower FP. 
The following table shows the numbers for +these assemblies: + +|Assembly|hs37 |hs38 |hs38DH|CHM1_1.1| huref| +|:------:|------:|------:|------:|------:|------:| +|FP | 255706| 168068| 142516|307172 | 575634| +|TP |2142260|2163113|2150844|2167235|2137053| + +With this measurement, hs38 is clearly better than hs37. Genome hs38DH reduces +FP by ~25k but also reduces TP by ~12k. We manually inspected variants called +from hs38 only and found the majority of them are associated with excessive read +depth, clustered variants or weak alignment. We believe most hs38-only calls are +problematic. In addition, if we compare two NA12878 replicates from HiSeq X10 +with nearly identical library construction, the difference is ~140k, an order +of magnitude higher than the difference between hs38 and hs38DH. ALT contigs, +decoy and HLA genes in hs38DH improve variant calling and enable the analyses of +ALT contigs and HLA typing at little cost. + +## Problems and Future Development + +There are some uncertainties about ALT mappings - we are not sure whether they +help biological discovery and don't know the best way to analyze them. Without +clear demand from downstream analyses, it is very difficult to design the +optimal mapping strategy. The current BWA-MEM method is just a start. If it +turns out to be useful in research, we will probably rewrite bwa-postalt.js in C +for performance; if not, we may make changes. It is also possible that we might +make breakthrough on the representation of multiple genomes, in which case, we +can even get rid of ALT contigs for good. 
+ + + +[res]: https://sourceforge.net/projects/bio-bwa/files/bwakit +[sb]: https://github.com/GregoryFaust/samblaster +[grc]: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/ +[novel]: https://gist.github.com/lh3/9935148b71f04ba1a8cc +[blat]: https://genome.ucsc.edu/cgi-bin/hgBlat +[blast]: http://blast.st-va.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome +[sgdp]: http://www.simonsfoundation.org/life-sciences/simons-genome-diversity-project/ +[hladb]: http://www.ebi.ac.uk/ipd/imgt/hla/ +[grcdef]: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/info/definitions.shtml +[hla1]: http://biorxiv.org/content/early/2014/07/08/006973 +[hlalink]: http://www.hladiseaseassociations.com +[hlatools]: https://www.biostars.org/p/93245/ +[hla2]: http://nar.oxfordjournals.org/content/41/14/e142.full.pdf+html +[hla3]: http://www.biomedcentral.com/1471-2164/15/325 +[hla4]: http://genomemedicine.com/content/4/12/95 +[kithelp]: https://github.com/lh3/bwa/tree/master/bwakit diff --git a/src/bwa/README.md b/src/bwa/README.md new file mode 100644 index 000000000..9ac49bbec --- /dev/null +++ b/src/bwa/README.md @@ -0,0 +1,174 @@ +[![Build Status](https://travis-ci.org/lh3/bwa.svg?branch=dev)](https://travis-ci.org/lh3/bwa) +[![Build Status](https://drone.io/github.com/lh3/bwa/status.png)](https://drone.io/github.com/lh3/bwa/latest) +##Getting started + + git clone https://github.com/lh3/bwa.git + cd bwa; make + ./bwa index ref.fa + ./bwa mem ref.fa read-se.fq.gz | gzip -3 > aln-se.sam.gz + ./bwa mem ref.fa read1.fq read2.fq | gzip -3 > aln-pe.sam.gz + +##Introduction + +BWA is a software package for mapping DNA sequences against a large reference +genome, such as the human genome. It consists of three algorithms: +BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina +sequence reads up to 100bp, while the rest two for longer sequences ranged from +70bp to a few megabases. 
BWA-MEM and BWA-SW share similar features such as the +support of long reads and chimeric alignment, but BWA-MEM, which is the latest, +is generally recommended as it is faster and more accurate. BWA-MEM also has +better performance than BWA-backtrack for 70-100bp Illumina reads. + +For all the algorithms, BWA first needs to construct the FM-index for the +reference genome (the **index** command). Alignment algorithms are invoked with +different sub-commands: **aln/samse/sampe** for BWA-backtrack, +**bwasw** for BWA-SW and **mem** for the BWA-MEM algorithm. + +## Availability + +BWA is released under [GPLv3][1]. The latest source code is [freely +available at github][2]. Released packages can [be downloaded][3] at +SourceForge. After you acquire the source code, simply use `make` to compile +and copy the single executable `bwa` to the destination you want. The only +dependency required to build BWA is [zlib][14]. + +Since 0.7.11, a precompiled binary for x86\_64-linux is available in [bwakit][17]. +In addition to BWA, this self-consistent package also comes with bwa-associated +and 3rd-party tools for proper BAM-to-FASTQ conversion, mapping to ALT contigs, +adapter trimming, duplicate marking, HLA typing and associated data files. + +## Seeking help + +The detailed usage is described in the man page available together with the +source code. You can use `man ./bwa.1` to view the man page in a terminal. The +[HTML version][4] of the man page can be found at the [BWA website][5]. If you +have questions about BWA, you may [sign up for the mailing list][6] and then send +the questions to [bio-bwa-help@sourceforge.net][7]. You may also ask questions +in forums such as [BioStar][8] and [SEQanswers][9]. + +## Citing BWA + +* Li H. and Durbin R. (2009) Fast and accurate short read alignment with + Burrows-Wheeler transform. *Bioinformatics*, **25**, 1754-1760. [PMID: + [19451168][10]]. (if you use the BWA-backtrack algorithm) + +* Li H. and Durbin R.
(2010) Fast and accurate long-read alignment with + Burrows-Wheeler transform. *Bioinformatics*, **26**, 589-595. [PMID: + [20080505][11]]. (if you use the BWA-SW algorithm) + +* Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs + with BWA-MEM. [arXiv:1303.3997v2][12] [q-bio.GN]. (if you use the BWA-MEM + algorithm or the **fastmap** command, or want to cite the whole BWA package) + +Please note that the last reference is a preprint hosted at [arXiv.org][13]. I +do not have plan to submit it to a peer-reviewed journal in the near future. + +##Frequently asked questions (FAQs) + +1. [What types of data does BWA work with?](#type) +2. [Why does a read appear multiple times in the output SAM?](#multihit) +3. [Does BWA work on reference sequences longer than 4GB in total?](#4gb) +4. [Why can one read in a pair has high mapping quality but the other has zero?](#pe0) +5. [How can a BWA-backtrack alignment stands out of the end of a chromosome?](#endref) +6. [Does BWA work with ALT contigs in the GRCh38 release?](#altctg) +7. [Can I just run BWA-MEM against GRCh38+ALT without post-processing?](#postalt) + +####1. What types of data does BWA work with? + +BWA works with a variety types of DNA sequence data, though the optimal +algorithm and setting may vary. 
The following list gives the recommended +settings: + +* Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly + contigs up to a few megabases mapped to a closely related reference genome: + + bwa mem ref.fa reads.fq > aln.sam + +* Illumina single-end reads shorter than ~70bp: + + bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam + +* Illumina/454/IonTorrent paired-end reads longer than ~70bp: + + bwa mem ref.fa read1.fq read2.fq > aln-pe.sam + +* Illumina paired-end reads shorter than ~70bp: + + bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai + bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam + +* PacBio subreads or Oxford Nanopore reads to a reference genome: + + bwa mem -x pacbio ref.fa reads.fq > aln.sam + bwa mem -x ont2d ref.fa reads.fq > aln.sam + +BWA-MEM is recommended for query sequences longer than ~70bp for a variety of +error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with +errors given longer query sequences as the chance of missing all seeds is small. +As is shown above, with non-default settings, BWA-MEM works with Oxford Nanopore +reads with a sequencing error rate over 20%. + +####2. Why does a read appear multiple times in the output SAM? + +BWA-SW and BWA-MEM perform local alignments. If there is a translocation, a gene +fusion or a long deletion, a read bridging the break point may have two hits, +occupying two lines in the SAM output. With the default setting of BWA-MEM, one +and only one line is primary and is soft clipped; other lines are tagged with +0x800 SAM flag (supplementary alignment) and are hard clipped. + +####3. Does BWA work on reference sequences longer than 4GB in total? + +Yes. Since 0.6.x, all BWA algorithms work with a genome with total length over +4GB. However, individual chromosome should not be longer than 2GB. + +####4. Why can one read in a pair has high mapping quality but the other has zero? 
+ +This is correct. Mapping quality is assigned for individual read, not for a read +pair. It is possible that one read can be mapped unambiguously, but its mate +falls in a tandem repeat and thus its accurate position cannot be determined. + +####5. How can a BWA-backtrack alignment stands out of the end of a chromosome? + +Internally BWA concatenates all reference sequences into one long sequence. A +read may be mapped to the junction of two adjacent reference sequences. In this +case, BWA-backtrack will flag the read as unmapped (0x4), but you will see +position, CIGAR and all the tags. A similar issue may occur to BWA-SW alignment +as well. BWA-MEM does not have this problem. + +####6. Does BWA work with ALT contigs in the GRCh38 release? + +Yes, since 0.7.11, BWA-MEM officially supports mapping to GRCh38+ALT. +BWA-backtrack and BWA-SW don't properly support ALT mapping as of now. Please +see [README-alt.md][18] for details. Briefly, it is recommended to use +[bwakit][17], the binary release of BWA, for generating the reference genome +and for mapping. + +####7. Can I just run BWA-MEM against GRCh38+ALT without post-processing? + +If you are not interested in hits to ALT contigs, it is okay to run BWA-MEM +without post-processing. The alignments produced this way are very close to +alignments against GRCh38 without ALT contigs. Nonetheless, applying +post-processing helps to reduce false mappings caused by reads from the +diverged part of ALT contigs and also enables HLA typing. It is recommended to +run the post-processing script. 
+ + + +[1]: http://en.wikipedia.org/wiki/GNU_General_Public_License +[2]: https://github.com/lh3/bwa +[3]: http://sourceforge.net/projects/bio-bwa/files/ +[4]: http://bio-bwa.sourceforge.net/bwa.shtml +[5]: http://bio-bwa.sourceforge.net/ +[6]: https://lists.sourceforge.net/lists/listinfo/bio-bwa-help +[7]: mailto:bio-bwa-help@sourceforge.net +[8]: http://biostars.org +[9]: http://seqanswers.com/ +[10]: http://www.ncbi.nlm.nih.gov/pubmed/19451168 +[11]: http://www.ncbi.nlm.nih.gov/pubmed/20080505 +[12]: http://arxiv.org/abs/1303.3997 +[13]: http://arxiv.org/ +[14]: http://zlib.net/ +[15]: https://github.com/lh3/bwa/tree/mem +[16]: ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/ +[17]: http://sourceforge.net/projects/bio-bwa/files/bwakit/ +[18]: https://github.com/lh3/bwa/blob/master/README-alt.md diff --git a/src/bwa/bamlite.c b/src/bwa/bamlite.c new file mode 100644 index 000000000..3704bebcf --- /dev/null +++ b/src/bwa/bamlite.c @@ -0,0 +1,210 @@ +#include +#include +#include +#include +#include +#include "bamlite.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +/********************* + * from bam_endian.c * + *********************/ + +static inline int bam_is_big_endian() +{ + long one= 1; + return !(*((char *)(&one))); +} +static inline uint16_t bam_swap_endian_2(uint16_t v) +{ + return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); +} +static inline void *bam_swap_endian_2p(void *x) +{ + *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x); + return x; +} +static inline uint32_t bam_swap_endian_4(uint32_t v) +{ + v = ((v & 0x0000FFFFU) << 16) | (v >> 16); + return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); +} +static inline void *bam_swap_endian_4p(void *x) +{ + *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x); + return x; +} +static inline uint64_t bam_swap_endian_8(uint64_t v) +{ + v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); + v 
= ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); + return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); +} +static inline void *bam_swap_endian_8p(void *x) +{ + *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x); + return x; +} + +/************** + * from bam.c * + **************/ + +int bam_is_be; + +bam_header_t *bam_header_init() +{ + bam_is_be = bam_is_big_endian(); + return (bam_header_t*)calloc(1, sizeof(bam_header_t)); +} + +void bam_header_destroy(bam_header_t *header) +{ + int32_t i; + if (header == 0) return; + if (header->target_name) { + for (i = 0; i < header->n_targets; ++i) + if (header->target_name[i]) free(header->target_name[i]); + if (header->target_len) free(header->target_len); + free(header->target_name); + } + if (header->text) free(header->text); + free(header); +} + +bam_header_t *bam_header_read(bamFile fp) +{ + bam_header_t *header; + char buf[4]; + int magic_len; + int32_t i = 1, name_len; + // read "BAM1" + magic_len = bam_read(fp, buf, 4); + if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) { + fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n"); + return NULL; + } + header = bam_header_init(); + // read plain text and the number of reference sequences + if (bam_read(fp, &header->l_text, 4) != 4) goto fail; + if (bam_is_be) bam_swap_endian_4p(&header->l_text); + header->text = (char*)calloc(header->l_text + 1, 1); + if (bam_read(fp, header->text, header->l_text) != header->l_text) goto fail; + if (bam_read(fp, &header->n_targets, 4) != 4) goto fail; + if (bam_is_be) bam_swap_endian_4p(&header->n_targets); + // read reference sequence names and lengths + header->target_name = (char**)calloc(header->n_targets, sizeof(char*)); + header->target_len = (uint32_t*)calloc(header->n_targets, 4); + for (i = 0; i != header->n_targets; ++i) { + if (bam_read(fp, &name_len, 4) != 4) goto fail; + if (bam_is_be) bam_swap_endian_4p(&name_len); + 
header->target_name[i] = (char*)calloc(name_len, 1); + if (bam_read(fp, header->target_name[i], name_len) != name_len) { + goto fail; + } + if (bam_read(fp, &header->target_len[i], 4) != 4) goto fail; + if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]); + } + return header; + fail: + bam_header_destroy(header); + return NULL; +} + +static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data) +{ + uint8_t *s; + uint32_t i, *cigar = (uint32_t*)(data + c->l_qname); + s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2; + for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]); + while (s < data + data_len) { + uint8_t type; + s += 2; // skip key + type = toupper(*s); ++s; // skip type + if (type == 'C' || type == 'A') ++s; + else if (type == 'S') { bam_swap_endian_2p(s); s += 2; } + else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; } + else if (type == 'D') { bam_swap_endian_8p(s); s += 8; } + else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; } + } +} + +int bam_read1(bamFile fp, bam1_t *b) +{ + bam1_core_t *c = &b->core; + int32_t block_len, ret, i; + uint32_t x[8]; + + if ((ret = bam_read(fp, &block_len, 4)) != 4) { + if (ret == 0) return -1; // normal end-of-file + else return -2; // truncated + } + if (bam_read(fp, x, sizeof(bam1_core_t)) != sizeof(bam1_core_t)) return -3; + if (bam_is_be) { + bam_swap_endian_4p(&block_len); + for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i); + } + c->tid = x[0]; c->pos = x[1]; + c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff; + c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff; + c->l_qseq = x[4]; + c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7]; + b->data_len = block_len - sizeof(bam1_core_t); + if (b->m_data < b->data_len) { + b->m_data = b->data_len; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4; + b->l_aux = b->data_len - 
c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2; + if (bam_is_be) swap_endian_data(c, b->data_len, b->data); + return 4 + block_len; +} + + +#ifdef USE_VERBOSE_ZLIB_WRAPPERS +// Versions of gzopen, gzread and gzclose that print up error messages + +gzFile bamlite_gzopen(const char *fn, const char *mode) { + gzFile fp; + if (strcmp(fn, "-") == 0) { + fp = gzdopen(fileno((strstr(mode, "r"))? stdin : stdout), mode); + if (!fp) { + fprintf(stderr, "Couldn't open %s : %s", + (strstr(mode, "r"))? "stdin" : "stdout", + strerror(errno)); + } + return fp; + } + if ((fp = gzopen(fn, mode)) == 0) { + fprintf(stderr, "Couldn't open %s : %s\n", fn, + errno ? strerror(errno) : "Out of memory"); + } + return fp; +} + +int bamlite_gzread(gzFile file, void *ptr, unsigned int len) { + int ret = gzread(file, ptr, len); + + if (ret < 0) { + int errnum = 0; + const char *msg = gzerror(file, &errnum); + fprintf(stderr, "gzread error: %s\n", + Z_ERRNO == errnum ? strerror(errno) : msg); + } + return ret; +} + +int bamlite_gzclose(gzFile file) { + int ret = gzclose(file); + if (Z_OK != ret) { + fprintf(stderr, "gzclose error: %s\n", + Z_ERRNO == ret ? 
strerror(errno) : zError(ret)); + } + + return ret; +} +#endif /* USE_VERBOSE_ZLIB_WRAPPERS */ diff --git a/src/bwa/bamlite.h b/src/bwa/bamlite.h new file mode 100644 index 000000000..efab7ac84 --- /dev/null +++ b/src/bwa/bamlite.h @@ -0,0 +1,114 @@ +#ifndef BAMLITE_H_ +#define BAMLITE_H_ + +#include +#include + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +#define USE_VERBOSE_ZLIB_WRAPPERS + +typedef gzFile bamFile; +#ifdef USE_VERBOSE_ZLIB_WRAPPERS +/* These print error messages on failure */ +# define bam_open(fn, mode) bamlite_gzopen(fn, mode) +# define bam_dopen(fd, mode) gzdopen(fd, mode) +# define bam_close(fp) bamlite_gzclose(fp) +# define bam_read(fp, buf, size) bamlite_gzread(fp, buf, size) +#else +# define bam_open(fn, mode) gzopen(fn, mode) +# define bam_dopen(fd, mode) gzdopen(fd, mode) +# define bam_close(fp) gzclose(fp) +# define bam_read(fp, buf, size) gzread(fp, buf, size) +#endif /* USE_VERBOSE_ZLIB_WRAPPERS */ + +typedef struct { + int32_t n_targets; + char **target_name; + uint32_t *target_len; + size_t l_text, n_text; + char *text; +} bam_header_t; + +#define BAM_FPAIRED 1 +#define BAM_FPROPER_PAIR 2 +#define BAM_FUNMAP 4 +#define BAM_FMUNMAP 8 +#define BAM_FREVERSE 16 +#define BAM_FMREVERSE 32 +#define BAM_FREAD1 64 +#define BAM_FREAD2 128 +#define BAM_FSECONDARY 256 +#define BAM_FQCFAIL 512 +#define BAM_FDUP 1024 + +#define BAM_CIGAR_SHIFT 4 +#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1) + +#define BAM_CMATCH 0 +#define BAM_CINS 1 +#define BAM_CDEL 2 +#define BAM_CREF_SKIP 3 +#define BAM_CSOFT_CLIP 4 +#define BAM_CHARD_CLIP 5 +#define BAM_CPAD 6 + +typedef struct { + int32_t tid; + int32_t pos; + uint32_t bin:16, qual:8, l_qname:8; + uint32_t flag:16, n_cigar:16; + int32_t l_qseq; + int32_t mtid; + int32_t mpos; + int32_t isize; +} bam1_core_t; + +typedef struct { + bam1_core_t core; + int l_aux, data_len, m_data; + uint8_t *data; +} bam1_t; + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, 
(x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0) +#define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0) +#define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname)) +#define bam1_qname(b) ((char*)((b)->data)) +#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname) +#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1)) +#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf) +#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2) + +#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t))) +#define bam_destroy1(b) do { \ + if (b) { free((b)->data); free(b); } \ + } while (0) + +extern int bam_is_be; + +#ifdef __cplusplus +extern "C" { +#endif + + bam_header_t *bam_header_init(void); + void bam_header_destroy(bam_header_t *header); + bam_header_t *bam_header_read(bamFile fp); + int bam_read1(bamFile fp, bam1_t *b); + +#ifdef USE_VERBOSE_ZLIB_WRAPPERS + gzFile bamlite_gzopen(const char *fn, const char *mode); + int bamlite_gzread(gzFile file, void *ptr, unsigned int len); + int bamlite_gzclose(gzFile file); +#endif /* USE_VERBOSE_ZLIB_WRAPPERS */ + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/bwa/bntseq.c b/src/bwa/bntseq.c new file mode 100644 index 000000000..465e38323 --- /dev/null +++ b/src/bwa/bntseq.c @@ -0,0 +1,446 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +/* Contact: Heng Li */ + +#include +#include +#include +#include +#include +#include +#include "bntseq.h" +#include "utils.h" + +#include "kseq.h" +KSEQ_DECLARE(gzFile) + +#include "khash.h" +KHASH_MAP_INIT_STR(str, int) + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +unsigned char nst_nt4_table[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +void bns_dump(const bntseq_t *bns, const char *prefix) +{ + char str[1024]; + FILE *fp; + int i; + { // dump .ann + strcpy(str, prefix); strcat(str, ".ann"); + fp = xopen(str, "w"); + err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed); + for (i = 0; i != bns->n_seqs; ++i) { + bntann1_t *p = bns->anns + i; + err_fprintf(fp, "%d %s", p->gi, p->name); + if (p->anno[0]) err_fprintf(fp, " %s\n", p->anno); + else err_fprintf(fp, "\n"); + err_fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs); + } + err_fflush(fp); + err_fclose(fp); + } + { // dump .amb + strcpy(str, prefix); strcat(str, ".amb"); + fp = xopen(str, "w"); + err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes); + for (i = 0; i != bns->n_holes; ++i) { + bntamb1_t *p = bns->ambs + i; + err_fprintf(fp, "%lld %d 
%c\n", (long long)p->offset, p->len, p->amb); + } + err_fflush(fp); + err_fclose(fp); + } +} + +bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename) +{ + char str[8192]; + FILE *fp; + const char *fname; + bntseq_t *bns; + long long xx; + int i; + int scanres; + bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); + { // read .ann + fp = xopen(fname = ann_filename, "r"); + scanres = fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed); + if (scanres != 3) goto badread; + bns->l_pac = xx; + bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t)); + for (i = 0; i < bns->n_seqs; ++i) { + bntann1_t *p = bns->anns + i; + char *q = str; + int c; + // read gi and sequence name + scanres = fscanf(fp, "%u%s", &p->gi, str); + if (scanres != 2) goto badread; + p->name = strdup(str); + // read fasta comments + while (q - str < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c; + while (c != '\n' && c != EOF) c = fgetc(fp); + if (c == EOF) { + scanres = EOF; + goto badread; + } + *q = 0; + if (q - str > 1 && strcmp(str, " (null)") != 0) p->anno = strdup(str + 1); // skip leading space + else p->anno = strdup(""); + // read the rest + scanres = fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs); + if (scanres != 3) goto badread; + p->offset = xx; + } + err_fclose(fp); + } + { // read .amb + int64_t l_pac; + int32_t n_seqs; + fp = xopen(fname = amb_filename, "r"); + scanres = fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes); + if (scanres != 3) goto badread; + l_pac = xx; + xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files."); + bns->ambs = bns->n_holes? 
(bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)) : 0; + for (i = 0; i < bns->n_holes; ++i) { + bntamb1_t *p = bns->ambs + i; + scanres = fscanf(fp, "%lld%d%s", &xx, &p->len, str); + if (scanres != 3) goto badread; + p->offset = xx; + p->amb = str[0]; + } + err_fclose(fp); + } + { // open .pac + bns->fp_pac = xopen(pac_filename, "rb"); + } + return bns; + + badread: + if (EOF == scanres) { + err_fatal(__func__, "Error reading %s : %s\n", fname, ferror(fp) ? strerror(errno) : "Unexpected end of file"); + } + err_fatal(__func__, "Parse error reading %s\n", fname); +} + +bntseq_t *bns_restore(const char *prefix) +{ + char ann_filename[1024], amb_filename[1024], pac_filename[1024], alt_filename[1024]; + FILE *fp; + bntseq_t *bns; + strcat(strcpy(ann_filename, prefix), ".ann"); + strcat(strcpy(amb_filename, prefix), ".amb"); + strcat(strcpy(pac_filename, prefix), ".pac"); + bns = bns_restore_core(ann_filename, amb_filename, pac_filename); + if (bns == 0) return 0; + if ((fp = fopen(strcat(strcpy(alt_filename, prefix), ".alt"), "r")) != 0) { // read .alt file if present + char str[1024]; + khash_t(str) *h; + int c, i, absent; + khint_t k; + h = kh_init(str); + for (i = 0; i < bns->n_seqs; ++i) { + k = kh_put(str, h, bns->anns[i].name, &absent); + kh_val(h, k) = i; + } + i = 0; + while ((c = fgetc(fp)) != EOF) { + if (c == '\t' || c == '\n' || c == '\r') { + str[i] = 0; + if (str[0] != '@') { + k = kh_get(str, h, str); + if (k != kh_end(h)) + bns->anns[kh_val(h, k)].is_alt = 1; + } + while (c != '\n' && c != EOF) c = fgetc(fp); + i = 0; + } else str[i++] = c; // FIXME: potential segfault here + } + kh_destroy(str, h); + fclose(fp); + } + return bns; +} + +void bns_destroy(bntseq_t *bns) +{ + if (bns == 0) return; + else { + int i; + if (bns->fp_pac) err_fclose(bns->fp_pac); + free(bns->ambs); + for (i = 0; i < bns->n_seqs; ++i) { + free(bns->anns[i].name); + free(bns->anns[i].anno); + } + free(bns->anns); + free(bns); + } +} + +#define _set_pac(pac, l, c) ((pac)[(l)>>2] 
|= (c)<<((~(l)&3)<<1)) +#define _get_pac(pac, l) ((pac)[(l)>>2]>>((~(l)&3)<<1)&3) + +static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q) +{ + bntann1_t *p; + int i, lasts; + if (bns->n_seqs == *m_seqs) { + *m_seqs <<= 1; + bns->anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t)); + } + p = bns->anns + bns->n_seqs; + p->name = strdup((char*)seq->name.s); + p->anno = seq->comment.l > 0? strdup((char*)seq->comment.s) : strdup("(null)"); + p->gi = 0; p->len = seq->seq.l; + p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len; + p->n_ambs = 0; + for (i = lasts = 0; i < seq->seq.l; ++i) { + int c = nst_nt4_table[(int)seq->seq.s[i]]; + if (c >= 4) { // N + if (lasts == seq->seq.s[i]) { // contiguous N + ++(*q)->len; + } else { + if (bns->n_holes == *m_holes) { + (*m_holes) <<= 1; + bns->ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t)); + } + *q = bns->ambs + bns->n_holes; + (*q)->len = 1; + (*q)->offset = p->offset + i; + (*q)->amb = seq->seq.s[i]; + ++p->n_ambs; + ++bns->n_holes; + } + } + lasts = seq->seq.s[i]; + { // fill buffer + if (c >= 4) c = lrand48()&3; + if (bns->l_pac == *m_pac) { // double the pac size + *m_pac <<= 1; + pac = realloc(pac, *m_pac/4); + memset(pac + bns->l_pac/4, 0, (*m_pac - bns->l_pac)/4); + } + _set_pac(pac, bns->l_pac, c); + ++bns->l_pac; + } + } + ++bns->n_seqs; + return pac; +} + +int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) +{ + extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c + kseq_t *seq; + char name[1024]; + bntseq_t *bns; + uint8_t *pac = 0; + int32_t m_seqs, m_holes; + int64_t ret = -1, m_pac, l; + bntamb1_t *q; + FILE *fp; + + // initialization + seq = kseq_init(fp_fa); + bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); + bns->seed = 11; // fixed seed for random generator + srand48(bns->seed); + m_seqs = m_holes = 8; m_pac = 0x10000; + bns->anns = 
(bntann1_t*)calloc(m_seqs, sizeof(bntann1_t)); + bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t)); + pac = calloc(m_pac/4, 1); + q = bns->ambs; + strcpy(name, prefix); strcat(name, ".pac"); + fp = xopen(name, "wb"); + // read sequences + while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q); + if (!for_only) { // add the reverse complemented sequence + m_pac = (bns->l_pac * 2 + 3) / 4 * 4; + pac = realloc(pac, m_pac/4); + memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4); + for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac) + _set_pac(pac, bns->l_pac, 3-_get_pac(pac, l)); + } + ret = bns->l_pac; + { // finalize .pac file + ubyte_t ct; + err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp); + // the following codes make the pac file size always (l_pac/4+1+1) + if (bns->l_pac % 4 == 0) { + ct = 0; + err_fwrite(&ct, 1, 1, fp); + } + ct = bns->l_pac % 4; + err_fwrite(&ct, 1, 1, fp); + // close .pac file + err_fflush(fp); + err_fclose(fp); + } + bns_dump(bns, prefix); + bns_destroy(bns); + kseq_destroy(seq); + free(pac); + return ret; +} + +int bwa_fa2pac(int argc, char *argv[]) +{ + int c, for_only = 0; + gzFile fp; + while ((c = getopt(argc, argv, "f")) >= 0) { + switch (c) { + case 'f': for_only = 1; break; + } + } + if (argc == optind) { + fprintf(stderr, "Usage: bwa fa2pac [-f] []\n"); + return 1; + } + fp = xzopen(argv[optind], "r"); + bns_fasta2bntseq(fp, (optind+1 < argc)? 
argv[optind+1] : argv[optind], for_only); + err_gzclose(fp); + return 0; +} + +int bns_pos2rid(const bntseq_t *bns, int64_t pos_f) +{ + int left, mid, right; + if (pos_f >= bns->l_pac) return -1; + left = 0; mid = 0; right = bns->n_seqs; + while (left < right) { // binary search + mid = (left + right) >> 1; + if (pos_f >= bns->anns[mid].offset) { + if (mid == bns->n_seqs - 1) break; + if (pos_f < bns->anns[mid+1].offset) break; // bracketed + left = mid + 1; + } else right = mid; + } + return mid; +} + +int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re) +{ + int is_rev, rid_b, rid_e; + if (rb < bns->l_pac && re > bns->l_pac) return -2; + assert(rb <= re); + rid_b = bns_pos2rid(bns, bns_depos(bns, rb, &is_rev)); + rid_e = rb < re? bns_pos2rid(bns, bns_depos(bns, re - 1, &is_rev)) : rid_b; + return rid_b == rid_e? rid_b : -1; +} + +int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) +{ + int left, mid, right, nn; + if (ref_id) *ref_id = bns_pos2rid(bns, pos_f); + left = 0; right = bns->n_holes; nn = 0; + while (left < right) { + mid = (left + right) >> 1; + if (pos_f >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1; + else if (pos_f + len <= bns->ambs[mid].offset) right = mid; + else { // overlap + if (pos_f >= bns->ambs[mid].offset) { + nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len? + bns->ambs[mid].offset + bns->ambs[mid].len - pos_f : len; + } else { + nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len? 
+ bns->ambs[mid].len : len - (bns->ambs[mid].offset - pos_f); + } + break; + } + } + return nn; +} + +uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len) +{ + uint8_t *seq = 0; + if (end < beg) end ^= beg, beg ^= end, end ^= beg; // if end is smaller, swap + if (end > l_pac<<1) end = l_pac<<1; + if (beg < 0) beg = 0; + if (beg >= l_pac || end <= l_pac) { + int64_t k, l = 0; + *len = end - beg; + seq = malloc(end - beg); + if (beg >= l_pac) { // reverse strand + int64_t beg_f = (l_pac<<1) - 1 - end; + int64_t end_f = (l_pac<<1) - 1 - beg; + for (k = end_f; k > beg_f; --k) + seq[l++] = 3 - _get_pac(pac, k); + } else { // forward strand + for (k = beg; k < end; ++k) + seq[l++] = _get_pac(pac, k); + } + } else *len = 0; // if bridging the forward-reverse boundary, return nothing + return seq; +} + +uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid) +{ + int64_t far_beg, far_end, len; + int is_rev; + uint8_t *seq; + + if (*end < *beg) *end ^= *beg, *beg ^= *end, *end ^= *beg; // if end is smaller, swap + assert(*beg <= mid && mid < *end); + *rid = bns_pos2rid(bns, bns_depos(bns, mid, &is_rev)); + far_beg = bns->anns[*rid].offset; + far_end = far_beg + bns->anns[*rid].len; + if (is_rev) { // flip to the reverse strand + int64_t tmp = far_beg; + far_beg = (bns->l_pac<<1) - far_end; + far_end = (bns->l_pac<<1) - tmp; + } + *beg = *beg > far_beg? *beg : far_beg; + *end = *end < far_end? 
*end : far_end; + seq = bns_get_seq(bns->l_pac, pac, *beg, *end, &len); + if (seq == 0 || *end - *beg != len) { + fprintf(stderr, "[E::%s] begin=%ld, mid=%ld, end=%ld, len=%ld, seq=%p, rid=%d, far_beg=%ld, far_end=%ld\n", + __func__, (long)*beg, (long)mid, (long)*end, (long)len, seq, *rid, (long)far_beg, (long)far_end); + } + assert(seq && *end - *beg == len); // assertion failure should never happen + return seq; +} diff --git a/src/bwa/bntseq.h b/src/bwa/bntseq.h new file mode 100644 index 000000000..03671d68e --- /dev/null +++ b/src/bwa/bntseq.h @@ -0,0 +1,92 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +/* Contact: Heng Li */ + +#ifndef BWT_BNTSEQ_H +#define BWT_BNTSEQ_H + +#include +#include +#include +#include + +#ifndef BWA_UBYTE +#define BWA_UBYTE +typedef uint8_t ubyte_t; +#endif + +typedef struct { + int64_t offset; + int32_t len; + int32_t n_ambs; + uint32_t gi; + int32_t is_alt; + char *name, *anno; +} bntann1_t; + +typedef struct { + int64_t offset; + int32_t len; + char amb; +} bntamb1_t; + +typedef struct { + int64_t l_pac; + int32_t n_seqs; + uint32_t seed; + bntann1_t *anns; // n_seqs elements + int32_t n_holes; + bntamb1_t *ambs; // n_holes elements + FILE *fp_pac; +} bntseq_t; + +extern unsigned char nst_nt4_table[256]; + +#ifdef __cplusplus +extern "C" { +#endif + + void bns_dump(const bntseq_t *bns, const char *prefix); + bntseq_t *bns_restore(const char *prefix); + bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename); + void bns_destroy(bntseq_t *bns); + int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only); + int bns_pos2rid(const bntseq_t *bns, int64_t pos_f); + int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id); + uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len); + uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid); + int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re); + +#ifdef __cplusplus +} +#endif + +static inline int64_t bns_depos(const bntseq_t *bns, int64_t pos, int *is_rev) +{ + return (*is_rev = (pos >= bns->l_pac))? 
(bns->l_pac<<1) - 1 - pos : pos; +} + +#endif diff --git a/src/bwa/bwa.c b/src/bwa/bwa.c new file mode 100644 index 000000000..f9ce6a17f --- /dev/null +++ b/src/bwa/bwa.c @@ -0,0 +1,447 @@ +#include +#include +#include +#include +#include "bntseq.h" +#include "bwa.h" +#include "ksw.h" +#include "utils.h" +#include "kstring.h" +#include "kvec.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +int bwa_verbose = 3; +char bwa_rg_id[256]; +char *bwa_pg; + +/************************ + * Batch FASTA/Q reader * + ************************/ + +#include "kseq.h" +KSEQ_DECLARE(gzFile) + +static inline void trim_readno(kstring_t *s) +{ + if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1])) + s->l -= 2, s->s[s->l] = 0; +} + +static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) +{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice + s->name = strdup(ks->name.s); + s->comment = ks->comment.l? strdup(ks->comment.s) : 0; + s->seq = strdup(ks->seq.s); + s->qual = ks->qual.l? strdup(ks->qual.s) : 0; + s->l_seq = strlen(s->seq); +} + +bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) +{ + kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; + int size = 0, m, n; + bseq1_t *seqs; + m = n = 0; seqs = 0; + while (kseq_read(ks) >= 0) { + if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads + fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); + break; + } + if (n >= m) { + m = m? 
m<<1 : 256; + seqs = realloc(seqs, m * sizeof(bseq1_t)); + } + trim_readno(&ks->name); + kseq2bseq1(ks, &seqs[n]); + seqs[n].id = n; + size += seqs[n++].l_seq; + if (ks2) { + trim_readno(&ks2->name); + kseq2bseq1(ks2, &seqs[n]); + seqs[n].id = n; + size += seqs[n++].l_seq; + } + if (size >= chunk_size && (n&1) == 0) break; + } + if (size == 0) { // test if the 2nd file is finished + if (ks2 && kseq_read(ks2) >= 0) + fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); + } + *n_ = n; + return seqs; +} + +void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]) +{ + int i, has_last; + kvec_t(bseq1_t) a[2] = {{0,0,0}, {0,0,0}}; + for (i = 1, has_last = 1; i < n; ++i) { + if (has_last) { + if (strcmp(seqs[i].name, seqs[i-1].name) == 0) { + kv_push(bseq1_t, a[1], seqs[i-1]); + kv_push(bseq1_t, a[1], seqs[i]); + has_last = 0; + } else kv_push(bseq1_t, a[0], seqs[i-1]); + } else has_last = 1; + } + if (has_last) kv_push(bseq1_t, a[0], seqs[i-1]); + sep[0] = a[0].a, m[0] = a[0].n; + sep[1] = a[1].a, m[1] = a[1].n; +} + +/***************** + * CIGAR related * + *****************/ + +void bwa_fill_scmat(int a, int b, int8_t mat[25]) +{ + int i, j, k; + for (i = k = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + mat[k++] = i == j? 
a : -b; + mat[k++] = -1; // ambiguous base + } + for (j = 0; j < 5; ++j) mat[k++] = -1; +} + +// Generate CIGAR when the alignment end points are known +uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM) +{ + uint32_t *cigar = 0; + uint8_t tmp, *rseq; + int i; + int64_t rlen; + kstring_t str; + const char *int2base; + + if (n_cigar) *n_cigar = 0; + if (NM) *NM = -1; + if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand + rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); + if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range + if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position + for (i = 0; i < l_query>>1; ++i) + tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; + for (i = 0; i < rlen>>1; ++i) + tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp; + } + if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP + // UPDATE: we come to this block now... FIXME: due to an issue in mem_reg2aln(), we never come to this block. This does not affect accuracy, but it hurts performance. + if (n_cigar) { + cigar = malloc(4); + cigar[0] = l_query<<4 | 0; + *n_cigar = 1; + } + for (i = 0, *score = 0; i < l_query; ++i) + *score += mat[rseq[i]*5 + query[i]]; + } else { + int w, max_gap, max_ins, max_del, min_w; + // set the band-width + max_ins = (int)((double)(((l_query+1)>>1) * mat[0] - o_ins) / e_ins + 1.); + max_del = (int)((double)(((l_query+1)>>1) * mat[0] - o_del) / e_del + 1.); + max_gap = max_ins > max_del? max_ins : max_del; + max_gap = max_gap > 1? max_gap : 1; + w = (max_gap + abs(rlen - l_query) + 1) >> 1; + w = w < w_? w : w_; + min_w = abs(rlen - l_query) + 3; + w = w > min_w? 
w : min_w; + // NW alignment + if (bwa_verbose >= 4) { + printf("* Global bandwidth: %d\n", w); + printf("* Global ref: "); for (i = 0; i < rlen; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); + printf("* Global query: "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); + } + *score = ksw_global2(l_query, query, rlen, rseq, 5, mat, o_del, e_del, o_ins, e_ins, w, n_cigar, &cigar); + } + if (NM && n_cigar) {// compute NM and MD + int k, x, y, u, n_mm = 0, n_gap = 0; + str.l = str.m = *n_cigar * 4; str.s = (char*)cigar; // append MD to CIGAR + int2base = rb < l_pac? "ACGTN" : "TGCAN"; + for (k = 0, x = y = u = 0; k < *n_cigar; ++k) { + int op, len; + cigar = (uint32_t*)str.s; + op = cigar[k]&0xf, len = cigar[k]>>4; + if (op == 0) { // match + for (i = 0; i < len; ++i) { + if (query[x + i] != rseq[y + i]) { + kputw(u, &str); + kputc(int2base[rseq[y+i]], &str); + ++n_mm; u = 0; + } else ++u; + } + x += len; y += len; + } else if (op == 2) { // deletion + if (k > 0 && k < *n_cigar - 1) { // don't do the following if D is the first or the last CIGAR + kputw(u, &str); kputc('^', &str); + for (i = 0; i < len; ++i) + kputc(int2base[rseq[y+i]], &str); + u = 0; n_gap += len; + } + y += len; + } else if (op == 1) x += len, n_gap += len; // insertion + } + kputw(u, &str); kputc(0, &str); + *NM = n_mm + n_gap; + cigar = (uint32_t*)str.s; + } + if (rb >= l_pac) // reverse back query + for (i = 0; i < l_query>>1; ++i) + tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; + +ret_gen_cigar: + free(rseq); + return cigar; +} + +uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM) +{ + return bwa_gen_cigar2(mat, q, r, q, r, w_, l_pac, pac, l_query, query, rb, re, score, n_cigar, NM); +} + +/********************* + * Full index reader * + *********************/ + +char 
*bwa_idx_infer_prefix(const char *hint) +{ + char *prefix; + int l_hint; + FILE *fp; + l_hint = strlen(hint); + prefix = malloc(l_hint + 3 + 4 + 1); + strcpy(prefix, hint); + strcpy(prefix + l_hint, ".64.bwt"); + if ((fp = fopen(prefix, "rb")) != 0) { + fclose(fp); + prefix[l_hint + 3] = 0; + return prefix; + } else { + strcpy(prefix + l_hint, ".bwt"); + if ((fp = fopen(prefix, "rb")) == 0) { + free(prefix); + return 0; + } else { + fclose(fp); + prefix[l_hint] = 0; + return prefix; + } + } +} + +bwt_t *bwa_idx_load_bwt(const char *hint) +{ + char *tmp, *prefix; + bwt_t *bwt; + prefix = bwa_idx_infer_prefix(hint); + if (prefix == 0) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); + return 0; + } + tmp = calloc(strlen(prefix) + 5, 1); + strcat(strcpy(tmp, prefix), ".bwt"); // FM-index + bwt = bwt_restore_bwt(tmp); + strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA) + bwt_restore_sa(tmp, bwt); + free(tmp); free(prefix); + return bwt; +} + +bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which) +{ + bwaidx_t *idx; + char *prefix; + prefix = bwa_idx_infer_prefix(hint); + if (prefix == 0) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); + return 0; + } + idx = calloc(1, sizeof(bwaidx_t)); + if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint); + if (which & BWA_IDX_BNS) { + int i, c; + idx->bns = bns_restore(prefix); + for (i = c = 0; i < idx->bns->n_seqs; ++i) + if (idx->bns->anns[i].is_alt) ++c; + if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] read %d ALT contigs\n", __func__, c); + if (which & BWA_IDX_PAC) { + idx->pac = calloc(idx->bns->l_pac/4+1, 1); + err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence + err_fclose(idx->bns->fp_pac); + idx->bns->fp_pac = 0; + } + } + free(prefix); + return idx; +} + +bwaidx_t *bwa_idx_load(const char *hint, int which) +{ + return 
bwa_idx_load_from_disk(hint, which); +} + +void bwa_idx_destroy(bwaidx_t *idx) +{ + if (idx == 0) return; + if (idx->mem == 0) { + if (idx->bwt) bwt_destroy(idx->bwt); + if (idx->bns) bns_destroy(idx->bns); + if (idx->pac) free(idx->pac); + } else { + free(idx->bwt); free(idx->bns->anns); free(idx->bns); + if (!idx->is_shm) free(idx->mem); + } + free(idx); +} + +int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx) +{ + int64_t k = 0, x; + int i; + + // generate idx->bwt + x = sizeof(bwt_t); idx->bwt = malloc(x); memcpy(idx->bwt, mem + k, x); k += x; + x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x; + x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x; + + // generate idx->bns and idx->pac + x = sizeof(bntseq_t); idx->bns = malloc(x); memcpy(idx->bns, mem + k, x); k += x; + x = idx->bns->n_holes * sizeof(bntamb1_t); idx->bns->ambs = (bntamb1_t*)(mem + k); k += x; + x = idx->bns->n_seqs * sizeof(bntann1_t); idx->bns->anns = malloc(x); memcpy(idx->bns->anns, mem + k, x); k += x; + for (i = 0; i < idx->bns->n_seqs; ++i) { + idx->bns->anns[i].name = (char*)(mem + k); k += strlen(idx->bns->anns[i].name) + 1; + idx->bns->anns[i].anno = (char*)(mem + k); k += strlen(idx->bns->anns[i].anno) + 1; + } + idx->pac = (uint8_t*)(mem + k); k += idx->bns->l_pac/4+1; + assert(k == l_mem); + + idx->l_mem = k; idx->mem = mem; + return 0; +} + +int bwa_idx2mem(bwaidx_t *idx) +{ + int i; + int64_t k, x, tmp; + uint8_t *mem; + + // copy idx->bwt + x = idx->bwt->bwt_size * 4; + mem = realloc(idx->bwt->bwt, sizeof(bwt_t) + x); idx->bwt->bwt = 0; + memmove(mem + sizeof(bwt_t), mem, x); + memcpy(mem, idx->bwt, sizeof(bwt_t)); k = sizeof(bwt_t) + x; + x = idx->bwt->n_sa * sizeof(bwtint_t); mem = realloc(mem, k + x); memcpy(mem + k, idx->bwt->sa, x); k += x; + free(idx->bwt->sa); + free(idx->bwt); idx->bwt = 0; + + // copy idx->bns + tmp = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t); + for (i = 0; 
i < idx->bns->n_seqs; ++i) // compute the size of heap-allocated memory + tmp += strlen(idx->bns->anns[i].name) + strlen(idx->bns->anns[i].anno) + 2; + mem = realloc(mem, k + sizeof(bntseq_t) + tmp); + x = sizeof(bntseq_t); memcpy(mem + k, idx->bns, x); k += x; + x = idx->bns->n_holes * sizeof(bntamb1_t); memcpy(mem + k, idx->bns->ambs, x); k += x; + free(idx->bns->ambs); + x = idx->bns->n_seqs * sizeof(bntann1_t); memcpy(mem + k, idx->bns->anns, x); k += x; + for (i = 0; i < idx->bns->n_seqs; ++i) { + x = strlen(idx->bns->anns[i].name) + 1; memcpy(mem + k, idx->bns->anns[i].name, x); k += x; + x = strlen(idx->bns->anns[i].anno) + 1; memcpy(mem + k, idx->bns->anns[i].anno, x); k += x; + free(idx->bns->anns[i].name); free(idx->bns->anns[i].anno); + } + free(idx->bns->anns); + + // copy idx->pac + x = idx->bns->l_pac/4+1; + mem = realloc(mem, k + x); + memcpy(mem + k, idx->pac, x); k += x; + free(idx->bns); idx->bns = 0; + free(idx->pac); idx->pac = 0; + + return bwa_mem2idx(k, mem, idx); +} + +/*********************** + * SAM header routines * + ***********************/ + +void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line) +{ + int i, n_SQ = 0; + extern char *bwa_pg; + if (hdr_line) { + const char *p = hdr_line; + while ((p = strstr(p, "@SQ\t")) != 0) { + if (p == hdr_line || *(p-1) == '\n') ++n_SQ; + p += 4; + } + } + if (n_SQ == 0) { + for (i = 0; i < bns->n_seqs; ++i) + err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); + } else if (n_SQ != bns->n_seqs && bwa_verbose >= 2) + fprintf(stderr, "[W::%s] %d @SQ lines provided with -H; %d sequences in the index. 
/*
 * Decode backslash escapes in place and return s.
 *
 * Recognised sequences: \t, \n, \r and \\. Any other escaped character is
 * dropped together with its backslash (unchanged historical behaviour). A lone
 * backslash at the very end of the string is dropped too; previously the loop
 * stepped over the terminating NUL in that case and kept reading and writing
 * past the end of the buffer.
 */
static char *bwa_escape(char *s)
{
	char *p, *q;
	for (p = q = s; *p; ++p) {
		if (*p != '\\') {
			*q++ = *p;
			continue;
		}
		++p;
		if (*p == 0) break; // dangling backslash: stop before the loop's ++p skips the terminator
		if (*p == 't') *q++ = '\t';
		else if (*p == 'n') *q++ = '\n';
		else if (*p == 'r') *q++ = '\r';
		else if (*p == '\\') *q++ = '\\';
	}
	*q = '\0';
	return s;
}
information on the reference sequences + uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base + + int is_shm; + int64_t l_mem; + uint8_t *mem; +} bwaidx_t; + +typedef struct { + int l_seq, id; + char *name, *comment, *seq, *qual, *sam; +} bseq1_t; + +extern int bwa_verbose; +extern char bwa_rg_id[256]; + +#ifdef __cplusplus +extern "C" { +#endif + + bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); + void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]); + + void bwa_fill_scmat(int a, int b, int8_t mat[25]); + uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); + uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); + + char *bwa_idx_infer_prefix(const char *hint); + bwt_t *bwa_idx_load_bwt(const char *hint); + + bwaidx_t *bwa_idx_load_from_shm(const char *hint); + bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which); + bwaidx_t *bwa_idx_load(const char *hint, int which); + void bwa_idx_destroy(bwaidx_t *idx); + int bwa_idx2mem(bwaidx_t *idx); + int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx); + + void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line); + char *bwa_set_rg(const char *s); + char *bwa_insert_header(const char *s, char *hdr); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/bwa/bwamem.c b/src/bwa/bwamem.c new file mode 100644 index 000000000..438665323 --- /dev/null +++ b/src/bwa/bwamem.c @@ -0,0 +1,1229 @@ +#include +#include +#include +#include +#include +#include +#ifdef HAVE_PTHREAD +#include +#endif + +#include "kstring.h" +#include "bwamem.h" +#include "bntseq.h" +#include "ksw.h" +#include "kvec.h" +#include "ksort.h" 
+#include "utils.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +/* Theory on probability and scoring *ungapped* alignment + * + * s'(a,b) = log[P(b|a)/P(b)] = log[4P(b|a)], assuming uniform base distribution + * s'(a,a) = log(4), s'(a,b) = log(4e/3), where e is the error rate + * + * Scale s'(a,b) to s(a,a) s.t. s(a,a)=x. Then s(a,b) = x*s'(a,b)/log(4), or conversely: s'(a,b)=s(a,b)*log(4)/x + * + * If the matching score is x and mismatch penalty is -y, we can compute error rate e: + * e = .75 * exp[-log(4) * y/x] + * + * log P(seq) = \sum_i log P(b_i|a_i) = \sum_i {s'(a,b) - log(4)} + * = \sum_i { s(a,b)*log(4)/x - log(4) } = log(4) * (S/x - l) + * + * where S=\sum_i s(a,b) is the alignment score. Converting to the phred scale: + * Q(seq) = -10/log(10) * log P(seq) = 10*log(4)/log(10) * (l - S/x) = 6.02 * (l - S/x) + * + * + * Gap open (zero gap): q' = log[P(gap-open)], r' = log[P(gap-ext)] (see Durbin et al. (1998) Section 4.1) + * Then q = x*log[P(gap-open)]/log(4), r = x*log[P(gap-ext)]/log(4) + * + * When there are gaps, l should be the length of alignment matches (i.e. 
the M operator in CIGAR) + */ + +static const bntseq_t *global_bns = 0; // for debugging only + +mem_opt_t *mem_opt_init() +{ + mem_opt_t *o; + o = calloc(1, sizeof(mem_opt_t)); + o->flag = 0; + o->a = 1; o->b = 4; + o->o_del = o->o_ins = 6; + o->e_del = o->e_ins = 1; + o->w = 100; + o->T = 30; + o->zdrop = 100; + o->pen_unpaired = 17; + o->pen_clip5 = o->pen_clip3 = 5; + + o->max_mem_intv = 20; + + o->min_seed_len = 19; + o->split_width = 10; + o->max_occ = 500; + o->max_chain_gap = 10000; + o->max_ins = 10000; + o->mask_level = 0.50; + o->drop_ratio = 0.50; + o->XA_drop_ratio = 0.80; + o->split_factor = 1.5; + o->chunk_size = 10000000; + o->n_threads = 1; + o->max_XA_hits = 5; + o->max_XA_hits_alt = 200; + o->max_matesw = 50; + o->mask_level_redun = 0.95; + o->min_chain_weight = 0; + o->max_chain_extend = 1<<30; + o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len); + bwa_fill_scmat(o->a, o->b, o->mat); + return o; +} + +/*************************** + * Collection SA invervals * + ***************************/ + +#define intv_lt(a, b) ((a).info < (b).info) +KSORT_INIT(mem_intv, bwtintv_t, intv_lt) + +typedef struct { + bwtintv_v mem, mem1, *tmpv[2]; +} smem_aux_t; + +static smem_aux_t *smem_aux_init() +{ + smem_aux_t *a; + a = calloc(1, sizeof(smem_aux_t)); + a->tmpv[0] = calloc(1, sizeof(bwtintv_v)); + a->tmpv[1] = calloc(1, sizeof(bwtintv_v)); + return a; +} + +static void smem_aux_destroy(smem_aux_t *a) +{ + free(a->tmpv[0]->a); free(a->tmpv[0]); + free(a->tmpv[1]->a); free(a->tmpv[1]); + free(a->mem.a); free(a->mem1.a); + free(a); +} + +static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq, smem_aux_t *a) +{ + int i, k, x = 0, old_n; + int start_width = (opt->flag & MEM_F_SELF_OVLP)? 
/*
 * Collect seed intervals for one query into a->mem, sorted by their info field.
 *
 * Each bwtintv_t packs the query start in the high 32 bits of info and the
 * query end in the low 32 bits; x[2] is the size of the SA interval, i.e. the
 * number of occurrences. Positions where seq[x] >= 4 (ambiguous base) are
 * skipped. a->mem1 and a->tmpv are scratch buffers reused between calls.
 */
static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq, smem_aux_t *a)
{
	int i, k, x = 0, old_n;
	int start_width = (opt->flag & MEM_F_SELF_OVLP)? 2 : 1;
	int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); // rounded threshold for re-seeding
	a->mem.n = 0; // reuse the vector's storage, discard old content

	// first pass: find all SMEMs
	while (x < len) {
		if (seq[x] < 4) {
			x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv); // returns the position to resume from
			for (i = 0; i < a->mem1.n; ++i) {
				bwtintv_t *p = &a->mem1.a[i];
				int slen = (uint32_t)p->info - (p->info>>32); // seed length
				if (slen >= opt->min_seed_len)
					kv_push(bwtintv_t, a->mem, *p);
			}
		} else ++x;
	}
	// second pass: find MEMs inside a long SMEM
	old_n = a->mem.n; // only revisit first-pass seeds; the loop appends to a->mem
	for (k = 0; k < old_n; ++k) {
		bwtintv_t *p = &a->mem.a[k]; // not used after the kv_push below, which may reallocate a->mem.a
		int start = p->info>>32, end = (int32_t)p->info;
		if (end - start < split_len || p->x[2] > opt->split_width) continue; // too short or already frequent enough
		bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv); // re-seed from the middle, requiring more occurrences
		for (i = 0; i < a->mem1.n; ++i)
			if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info>>32) >= opt->min_seed_len)
				kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
	}
	// third pass: LAST-like
	if (opt->max_mem_intv > 0) {
		x = 0;
		while (x < len) {
			if (seq[x] < 4) {
				if (1) {
					bwtintv_t m;
					x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
					if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m);
				} else { // for now, we never come to this block which is slower
					x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv);
					for (i = 0; i < a->mem1.n; ++i)
						kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
				}
			} else ++x;
		}
	}
	// sort
	ks_introsort(mem_intv, a->mem.n, a->mem.a);
}
+KBTREE_INIT(chn, mem_chain_t, chain_cmp) + +// return 1 if the seed is merged into the chain +static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p, int seed_rid) +{ + int64_t qend, rend, x, y; + const mem_seed_t *last = &c->seeds[c->n-1]; + qend = last->qbeg + last->len; + rend = last->rbeg + last->len; + if (seed_rid != c->rid) return 0; // different chr; request a new chain + if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) + return 1; // contained seed; do nothing + if ((last->rbeg < l_pac || c->seeds[0].rbeg < l_pac) && p->rbeg >= l_pac) return 0; // don't chain if on different strand + x = p->qbeg - last->qbeg; // always non-negtive + y = p->rbeg - last->rbeg; + if (y >= 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain + if (c->n == c->m) { + c->m <<= 1; + c->seeds = realloc(c->seeds, c->m * sizeof(mem_seed_t)); + } + c->seeds[c->n++] = *p; + return 1; + } + return 0; // request to add a new chain +} + +int mem_chain_weight(const mem_chain_t *c) +{ + int64_t end; + int j, w = 0, tmp; + for (j = 0, end = 0; j < c->n; ++j) { + const mem_seed_t *s = &c->seeds[j]; + if (s->qbeg >= end) w += s->len; + else if (s->qbeg + s->len > end) w += s->qbeg + s->len - end; + end = end > s->qbeg + s->len? end : s->qbeg + s->len; + } + tmp = w; w = 0; + for (j = 0, end = 0; j < c->n; ++j) { + const mem_seed_t *s = &c->seeds[j]; + if (s->rbeg >= end) w += s->len; + else if (s->rbeg + s->len > end) w += s->rbeg + s->len - end; + end = end > s->rbeg + s->len? end : s->rbeg + s->len; + } + w = w < tmp? w : tmp; + return w < 1<<30? 
w : (1<<30)-1; +} + +void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn) +{ + int i, j; + for (i = 0; i < chn->n; ++i) { + mem_chain_t *p = &chn->a[i]; + err_printf("* Found CHAIN(%d): n=%d; weight=%d", i, p->n, mem_chain_weight(p)); + for (j = 0; j < p->n; ++j) { + bwtint_t pos; + int is_rev; + pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev); + if (is_rev) pos -= p->seeds[j].len - 1; + err_printf("\t%d;%d;%d,%ld(%s:%c%ld)", p->seeds[j].score, p->seeds[j].len, p->seeds[j].qbeg, (long)p->seeds[j].rbeg, bns->anns[p->rid].name, "+-"[is_rev], (long)(pos - bns->anns[p->rid].offset) + 1); + } + err_putchar('\n'); + } +} + +mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, int len, const uint8_t *seq, void *buf) +{ + int i, b, e, l_rep; + int64_t l_pac = bns->l_pac; + mem_chain_v chain; + kbtree_t(chn) *tree; + smem_aux_t *aux; + + kv_init(chain); + if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match + tree = kb_init(chn, KB_DEFAULT_SIZE); + + aux = buf? (smem_aux_t*)buf : smem_aux_init(); + mem_collect_intv(opt, bwt, len, seq, aux); + for (i = 0, b = e = l_rep = 0; i < aux->mem.n; ++i) { // compute frac_rep + bwtintv_t *p = &aux->mem.a[i]; + int sb = (p->info>>32), se = (uint32_t)p->info; + if (p->x[2] <= opt->max_occ) continue; + if (sb > e) l_rep += e - b, b = sb, e = se; + else e = e > se? e : se; + } + l_rep += e - b; + for (i = 0; i < aux->mem.n; ++i) { + bwtintv_t *p = &aux->mem.a[i]; + int step, count, slen = (uint32_t)p->info - (p->info>>32); // seed length + int64_t k; + // if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive + step = p->x[2] > opt->max_occ? 
p->x[2] / opt->max_occ : 1; + for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) { + mem_chain_t tmp, *lower, *upper; + mem_seed_t s; + int rid, to_add = 0; + s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference + s.qbeg = p->info>>32; + s.score= s.len = slen; + rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len); + if (rid < 0) continue; // bridging multiple reference sequences or the forward-reverse boundary; TODO: split the seed; don't discard it!!! + if (kb_size(tree)) { + kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain + if (!lower || !test_and_merge(opt, l_pac, lower, &s, rid)) to_add = 1; + } else to_add = 1; + if (to_add) { // add the seed as a new chain + tmp.n = 1; tmp.m = 4; + tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); + tmp.seeds[0] = s; + tmp.rid = rid; + tmp.is_alt = !!bns->anns[rid].is_alt; + kb_putp(chn, tree, &tmp); + } + } + } + if (buf == 0) smem_aux_destroy(aux); + + kv_resize(mem_chain_t, chain, kb_size(tree)); + + #define traverse_func(p_) (chain.a[chain.n++] = *(p_)) + __kb_traverse(mem_chain_t, tree, traverse_func); + #undef traverse_func + + for (i = 0; i < chain.n; ++i) chain.a[i].frac_rep = (float)l_rep / len; + if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len); + + kb_destroy(chn, tree); + return chain; +} + +/******************** + * Filtering chains * + ********************/ + +#define chn_beg(ch) ((ch).seeds->qbeg) +#define chn_end(ch) ((ch).seeds[(ch).n-1].qbeg + (ch).seeds[(ch).n-1].len) + +#define flt_lt(a, b) ((a).w > (b).w) +KSORT_INIT(mem_flt, mem_chain_t, flt_lt) + +int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *a) +{ + int i, k; + kvec_t(int) chains = {0,0,0}; // this keeps int indices of the non-overlapping chains + if (n_chn == 0) return 0; // no need to filter + // compute the weight of each chain and drop chains with small weight + for (i = k = 0; i < n_chn; 
++i) { + mem_chain_t *c = &a[i]; + c->first = -1; c->kept = 0; + c->w = mem_chain_weight(c); + if (c->w < opt->min_chain_weight) free(c->seeds); + else a[k++] = *c; + } + n_chn = k; + ks_introsort(mem_flt, n_chn, a); + // pairwise chain comparisons + a[0].kept = 3; + kv_push(int, chains, 0); + for (i = 1; i < n_chn; ++i) { + int large_ovlp = 0; + for (k = 0; k < chains.n; ++k) { + int j = chains.a[k]; + int b_max = chn_beg(a[j]) > chn_beg(a[i])? chn_beg(a[j]) : chn_beg(a[i]); + int e_min = chn_end(a[j]) < chn_end(a[i])? chn_end(a[j]) : chn_end(a[i]); + if (e_min > b_max && (!a[j].is_alt || a[i].is_alt)) { // have overlap; don't consider ovlp where the kept chain is ALT while the current chain is primary + int li = chn_end(a[i]) - chn_beg(a[i]); + int lj = chn_end(a[j]) - chn_beg(a[j]); + int min_l = li < lj? li : lj; + if (e_min - b_max >= min_l * opt->mask_level && min_l < opt->max_chain_gap) { // significant overlap + large_ovlp = 1; + if (a[j].first < 0) a[j].first = i; // keep the first shadowed hit s.t. mapq can be more accurate + if (a[i].w < a[j].w * opt->drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1) + break; + } + } + } + if (k == chains.n) { + kv_push(int, chains, i); + a[i].kept = large_ovlp? 
2 : 3; + } + } + for (i = 0; i < chains.n; ++i) { + mem_chain_t *c = &a[chains.a[i]]; + if (c->first >= 0) a[c->first].kept = 1; + } + free(chains.a); + for (i = k = 0; i < n_chn; ++i) { // don't extend more than opt->max_chain_extend .kept=1/2 chains + if (a[i].kept == 0 || a[i].kept == 3) continue; + if (++k >= opt->max_chain_extend) break; + } + for (; i < n_chn; ++i) + if (a[i].kept < 3) a[i].kept = 0; + for (i = k = 0; i < n_chn; ++i) { // free discarded chains + mem_chain_t *c = &a[i]; + if (c->kept == 0) free(c->seeds); + else a[k++] = a[i]; + } + return k; +} + +/****************************** + * De-overlap single-end hits * + ******************************/ + +#define alnreg_slt2(a, b) ((a).re < (b).re) +KSORT_INIT(mem_ars2, mem_alnreg_t, alnreg_slt2) + +#define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb)))) +KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt) + +#define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && (a).hash < (b).hash)))) +KSORT_INIT(mem_ars_hash, mem_alnreg_t, alnreg_hlt) + +#define alnreg_hlt2(a, b) ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash)))) +KSORT_INIT(mem_ars_hash2, mem_alnreg_t, alnreg_hlt2) + +#define PATCH_MAX_R_BW 0.05f +#define PATCH_MIN_SC_RATIO 0.90f + +int mem_patch_reg(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, const mem_alnreg_t *a, const mem_alnreg_t *b, int *_w) +{ + int w, score, q_s, r_s; + double r; + if (bns == 0 || pac == 0 || query == 0) return 0; + assert(a->rid == b->rid && a->rb <= b->rb); + if (a->rb < bns->l_pac && b->rb >= bns->l_pac) return 0; // on different strands + if (a->qb >= b->qb || a->qe >= b->qe || a->re >= b->re) return 0; // not colinear + w = (a->re - b->rb) - (a->qe - b->qb); // required bandwidth + w = w > 0? 
/*
 * Remove redundant alignment regions from a[0..n) and merge pairs that
 * mem_patch_reg() accepts, then drop exact duplicates.
 *
 * A region is marked as excluded by making it empty on the query (qe == qb);
 * the compaction loops keep only regions with qe > qb. On return the
 * survivors are at the front of a[], ordered by the mem_ars comparison, and
 * their count is the return value. When bns, pac or query is NULL,
 * mem_patch_reg() returns 0 and no merging happens.
 */
int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a)
{
	int m, i, j;
	if (n <= 1) return n;
	ks_introsort(mem_ars2, n, a); // sort by the END position, not START!
	for (i = 0; i < n; ++i) a[i].n_comp = 1;
	for (i = 1; i < n; ++i) {
		mem_alnreg_t *p = &a[i];
		if (p->rid != a[i-1].rid || p->rb >= a[i-1].re + opt->max_chain_gap) continue; // then no need to go into the loop below
		// walk backwards over earlier regions on the same sequence that end within max_chain_gap of p's start
		for (j = i - 1; j >= 0 && p->rid == a[j].rid && p->rb < a[j].re + opt->max_chain_gap; --j) {
			mem_alnreg_t *q = &a[j];
			int64_t or, oq, mr, mq;
			int score, w;
			if (q->qe == q->qb) continue; // a[j] has been excluded
			or = q->re - p->rb; // overlap length on the reference
			oq = q->qb < p->qb? q->qe - p->qb : p->qe - q->qb; // overlap length on the query
			mr = q->re - q->rb < p->re - p->rb? q->re - q->rb : p->re - p->rb; // min ref len in alignment
			mq = q->qe - q->qb < p->qe - p->qb? q->qe - q->qb : p->qe - p->qb; // min qry len in alignment
			if (or > opt->mask_level_redun * mr && oq > opt->mask_level_redun * mq) { // one of the hits is redundant
				if (p->score < q->score) {
					p->qe = p->qb; // exclude p; nothing more to compare it with
					break;
				} else q->qe = q->qb; // exclude q
			} else if (q->rb < p->rb && (score = mem_patch_reg(opt, bns, pac, query, q, p, &w)) > 0) { // then merge q into p
				p->n_comp += q->n_comp + 1;
				p->seedcov = p->seedcov > q->seedcov? p->seedcov : q->seedcov;
				p->sub = p->sub > q->sub? p->sub : q->sub;
				p->csub = p->csub > q->csub? p->csub : q->csub;
				p->qb = q->qb, p->rb = q->rb; // p now starts where q started
				p->truesc = p->score = score;
				p->w = w;
				q->qb = q->qe; // exclude q
			}
		}
	}
	for (i = 0, m = 0; i < n; ++i) // exclude identical hits
		if (a[i].qe > a[i].qb) {
			if (m != i) a[m++] = a[i];
			else ++m;
		}
	n = m;
	ks_introsort(mem_ars, n, a);
	for (i = 1; i < n; ++i) { // mark identical hits
		if (a[i].score == a[i-1].score && a[i].rb == a[i-1].rb && a[i].qb == a[i-1].qb)
			a[i].qe = a[i].qb;
	}
	for (i = 1, m = 1; i < n; ++i) // exclude identical hits
		if (a[i].qe > a[i].qb) {
			if (m != i) a[m++] = a[i];
			else ++m;
		}
	return m;
}
a[i].qe - a[i].qb : a[j].qe - a[j].qb; + if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap + if (a[j].sub == 0) a[j].sub = a[i].score; + if (a[j].score - a[i].score <= tmp && (a[j].is_alt || !a[i].is_alt)) + ++a[j].sub_n; + break; + } + } + } + if (k == z->n) kv_push(int, *z, i); + else a[i].secondary = z->a[k]; + } +} + +int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id) +{ + int i, n_pri; + int_v z = {0,0,0}; + if (n == 0) return 0; + for (i = n_pri = 0; i < n; ++i) { + a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_all = -1, a[i].hash = hash_64(id+i); + if (!a[i].is_alt) ++n_pri; + } + ks_introsort(mem_ars_hash, n, a); + mem_mark_primary_se_core(opt, n, a, &z); + for (i = 0; i < n; ++i) { + mem_alnreg_t *p = &a[i]; + p->secondary_all = i; // keep the rank in the first round + if (!p->is_alt && p->secondary >= 0 && a[p->secondary].is_alt) + p->alt_sc = a[p->secondary].score; + } + if (n_pri >= 0 && n_pri < n) { + kv_resize(int, z, n); + if (n_pri > 0) ks_introsort(mem_ars_hash2, n, a); + for (i = 0; i < n; ++i) z.a[a[i].secondary_all] = i; + for (i = 0; i < n; ++i) { + if (a[i].secondary >= 0) { + a[i].secondary_all = z.a[a[i].secondary]; + if (a[i].is_alt) a[i].secondary = INT_MAX; + } else a[i].secondary_all = -1; + } + if (n_pri > 0) { // mark primary for hits to the primary assembly only + for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1; + mem_mark_primary_se_core(opt, n_pri, a, &z); + } + } else { + for (i = 0; i < n; ++i) + a[i].secondary_all = a[i].secondary; + } + free(z.a); + return n_pri; +} + +/********************************* + * Test if a seed is good enough * + *********************************/ + +#define MEM_SHORT_EXT 50 +#define MEM_SHORT_LEN 200 + +#define MEM_HSP_COEF 1.1f +#define MEM_MINSC_COEF 5.5f +#define MEM_SEEDSW_COEF 0.05f + +int mem_seed_sw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_seed_t 
*s) +{ + int qb, qe, rid; + int64_t rb, re, mid, l_pac = bns->l_pac; + uint8_t *rseq = 0; + kswr_t x; + + if (s->len >= MEM_SHORT_LEN) return -1; // the seed is longer than the max-extend; no need to do SW + qb = s->qbeg, qe = s->qbeg + s->len; + rb = s->rbeg, re = s->rbeg + s->len; + mid = (rb + re) >> 1; + qb -= MEM_SHORT_EXT; qb = qb > 0? qb : 0; + qe += MEM_SHORT_EXT; qe = qe < l_query? qe : l_query; + rb -= MEM_SHORT_EXT; rb = rb > 0? rb : 0; + re += MEM_SHORT_EXT; re = re < l_pac<<1? re : l_pac<<1; + if (rb < l_pac && l_pac < re) { + if (mid < l_pac) re = l_pac; + else rb = l_pac; + } + if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return -1; // the seed seems good enough; no need to do SW + + rseq = bns_fetch_seq(bns, pac, &rb, mid, &re, &rid); + x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, KSW_XSTART, 0); + free(rseq); + return x.score; +} + +void mem_flt_chained_seeds(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, int n_chn, mem_chain_t *a) +{ + double min_l = opt->min_chain_weight? MEM_HSP_COEF * opt->min_chain_weight : MEM_MINSC_COEF * log(l_query); + int i, j, k, min_HSP_score = (int)(opt->a * min_l + .499); + if (min_l > MEM_SEEDSW_COEF * l_query) return; // don't run the following for short reads + for (i = 0; i < n_chn; ++i) { + mem_chain_t *c = &a[i]; + for (j = k = 0; j < c->n; ++j) { + mem_seed_t *s = &c->seeds[j]; + s->score = mem_seed_sw(opt, bns, pac, l_query, query, s); + if (s->score < 0 || s->score >= min_HSP_score) { + s->score = s->score < 0? 
s->len * opt->a : s->score; + c->seeds[k++] = *s; + } + } + c->n = k; + } +} + +/**************************************** + * Construct the alignment from a chain * + ****************************************/ + +static inline int cal_max_gap(const mem_opt_t *opt, int qlen) +{ + int l_del = (int)((double)(qlen * opt->a - opt->o_del) / opt->e_del + 1.); + int l_ins = (int)((double)(qlen * opt->a - opt->o_ins) / opt->e_ins + 1.); + int l = l_del > l_ins? l_del : l_ins; + l = l > 1? l : 1; + return l < opt->w<<1? l : opt->w<<1; +} + +#define MAX_BAND_TRY 2 + +void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) +{ + int i, k, rid, max_off[2], aw[2]; // aw: actual bandwidth used in extension + int64_t l_pac = bns->l_pac, rmax[2], tmp, max = 0; + const mem_seed_t *s; + uint8_t *rseq = 0; + uint64_t *srt; + + if (c->n == 0) return; + // get the max possible span + rmax[0] = l_pac<<1; rmax[1] = 0; + for (i = 0; i < c->n; ++i) { + int64_t b, e; + const mem_seed_t *t = &c->seeds[i]; + b = t->rbeg - (t->qbeg + cal_max_gap(opt, t->qbeg)); + e = t->rbeg + t->len + ((l_query - t->qbeg - t->len) + cal_max_gap(opt, l_query - t->qbeg - t->len)); + rmax[0] = rmax[0] < b? rmax[0] : b; + rmax[1] = rmax[1] > e? rmax[1] : e; + if (t->len > max) max = t->len; + } + rmax[0] = rmax[0] > 0? rmax[0] : 0; + rmax[1] = rmax[1] < l_pac<<1? 
rmax[1] : l_pac<<1; + if (rmax[0] < l_pac && l_pac < rmax[1]) { // crossing the forward-reverse boundary; then choose one side + if (c->seeds[0].rbeg < l_pac) rmax[1] = l_pac; // this works because all seeds are guaranteed to be on the same strand + else rmax[0] = l_pac; + } + // retrieve the reference sequence + rseq = bns_fetch_seq(bns, pac, &rmax[0], c->seeds[0].rbeg, &rmax[1], &rid); + assert(c->rid == rid); + + srt = malloc(c->n * 8); + for (i = 0; i < c->n; ++i) + srt[i] = (uint64_t)c->seeds[i].score<<32 | i; + ks_introsort_64(c->n, srt); + + for (k = c->n - 1; k >= 0; --k) { + mem_alnreg_t *a; + s = &c->seeds[(uint32_t)srt[k]]; + + for (i = 0; i < av->n; ++i) { // test whether extension has been made before + mem_alnreg_t *p = &av->a[i]; + int64_t rd; + int qd, w, max_gap; + if (s->rbeg < p->rb || s->rbeg + s->len > p->re || s->qbeg < p->qb || s->qbeg + s->len > p->qe) continue; // not fully contained + if (s->len - p->seedlen0 > .1 * l_query) continue; // this seed may give a better alignment + // qd: distance ahead of the seed on query; rd: on reference + qd = s->qbeg - p->qb; rd = s->rbeg - p->rb; + max_gap = cal_max_gap(opt, qd < rd? qd : rd); // the maximal gap allowed in regions ahead of the seed + w = max_gap < p->w? max_gap : p->w; // bounded by the band width + if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit + // similar to the previous four lines, but this time we look at the region behind + qd = p->qe - (s->qbeg + s->len); rd = p->re - (s->rbeg + s->len); + max_gap = cal_max_gap(opt, qd < rd? qd : rd); + w = max_gap < p->w? 
max_gap : p->w; + if (qd - rd < w && rd - qd < w) break; + } + if (i < av->n) { // the seed is (almost) contained in an existing alignment; further testing is needed to confirm it is not leading to a different aln + if (bwa_verbose >= 4) + printf("** Seed(%d) [%ld;%ld,%ld] is almost contained in an existing alignment [%d,%d) <=> [%ld,%ld)\n", + k, (long)s->len, (long)s->qbeg, (long)s->rbeg, av->a[i].qb, av->a[i].qe, (long)av->a[i].rb, (long)av->a[i].re); + for (i = k + 1; i < c->n; ++i) { // check overlapping seeds in the same chain + const mem_seed_t *t; + if (srt[i] == 0) continue; + t = &c->seeds[(uint32_t)srt[i]]; + if (t->len < s->len * .95) continue; // only check overlapping if t is long enough; TODO: more efficient by early stopping + if (s->qbeg <= t->qbeg && s->qbeg + s->len - t->qbeg >= s->len>>2 && t->qbeg - s->qbeg != t->rbeg - s->rbeg) break; + if (t->qbeg <= s->qbeg && t->qbeg + t->len - s->qbeg >= s->len>>2 && s->qbeg - t->qbeg != s->rbeg - t->rbeg) break; + } + if (i == c->n) { // no overlapping seeds; then skip extension + srt[k] = 0; // mark that seed extension has not been performed + continue; + } + if (bwa_verbose >= 4) + printf("** Seed(%d) might lead to a different alignment even though it is contained. 
Extension will be performed.\n", k); + } + + a = kv_pushp(mem_alnreg_t, *av); + memset(a, 0, sizeof(mem_alnreg_t)); + a->w = aw[0] = aw[1] = opt->w; + a->score = a->truesc = -1; + a->rid = c->rid; + + if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] @ %s <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg, bns->anns[c->rid].name); + if (s->qbeg) { // left extension + uint8_t *rs, *qs; + int qle, tle, gtle, gscore; /* qle/tle: end point used for the local extension below; gtle/gscore: end point and score used for the to-end extension */ + qs = malloc(s->qbeg); + for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; /* query prefix, reversed so the extension runs away from the seed */ + tmp = s->rbeg - rmax[0]; + rs = malloc(tmp); + for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; /* reference prefix, reversed the same way */ + for (i = 0; i < MAX_BAND_TRY; ++i) { + int prev = a->score; + aw[0] = opt->w << i; + if (bwa_verbose >= 4) { + int j; + printf("*** Left ref: "); for (j = 0; j < tmp; ++j) putchar("ACGTN"[(int)rs[j]]); putchar('\n'); + printf("*** Left query: "); for (j = 0; j < s->qbeg; ++j) putchar("ACGTN"[(int)qs[j]]); putchar('\n'); + } + a->score = ksw_extend2(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, &gtle, &gscore, &max_off[0]); + if (bwa_verbose >= 4) { printf("*** Left extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); } + if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; + } + // check whether we prefer to reach the end of the query + if (gscore <= 0 || gscore <= a->score - opt->pen_clip5) { // local extension + a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; + a->truesc = a->score; + } else { // to-end extension + a->qb = 0, a->rb = s->rbeg - gtle; + a->truesc = gscore; + } + free(qs); free(rs); + } else a->score = a->truesc = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; + + if (s->qbeg + s->len != l_query) { // right extension + int qle, tle, qe, re, gtle, gscore, sc0 = a->score; /* sc0: score before the right extension; passed to ksw_extend2 where the left extension passed the seed score */ + qe = s->qbeg + s->len; + re = s->rbeg + s->len - rmax[0]; +
assert(re >= 0); + for (i = 0; i < MAX_BAND_TRY; ++i) { + int prev = a->score; + aw[1] = opt->w << i; + if (bwa_verbose >= 4) { + int j; + printf("*** Right ref: "); for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[re+j]]); putchar('\n'); + printf("*** Right query: "); for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[qe+j]]); putchar('\n'); + } + a->score = ksw_extend2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, &gtle, &gscore, &max_off[1]); + if (bwa_verbose >= 4) { printf("*** Right extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); } + if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; + } + // similar to the above + if (gscore <= 0 || gscore <= a->score - opt->pen_clip3) { // local extension + a->qe = qe + qle, a->re = rmax[0] + re + tle; + a->truesc += a->score - sc0; + } else { // to-end extension + a->qe = l_query, a->re = rmax[0] + re + gtle; + a->truesc += gscore - sc0; + } + } else a->qe = l_query, a->re = s->rbeg + s->len; + if (bwa_verbose >= 4) printf("*** Added alignment region: [%d,%d) <=> [%ld,%ld); score=%d; {left,right}_bandwidth={%d,%d}\n", a->qb, a->qe, (long)a->rb, (long)a->re, a->score, aw[0], aw[1]); + + // compute seedcov + for (i = 0, a->seedcov = 0; i < c->n; ++i) { + const mem_seed_t *t = &c->seeds[i]; + if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained + a->seedcov += t->len; // this is not very accurate, but for approx. mapQ, this is good enough + } + a->w = aw[0] > aw[1]?
aw[0] : aw[1]; + a->seedlen0 = s->len; + + a->frac_rep = c->frac_rep; + } + free(srt); free(rseq); +} + +/***************************** + * Basic hit->SAM conversion * + *****************************/ +/* Infer a band width from the two aligned lengths l1/l2 and the alignment score. a: match score, q: gap open penalty, r: gap extension penalty (as passed by mem_reg2aln). Returns 0 when the lengths are equal and the score deficit l1*a - score is below 2*(q + r - a); otherwise (min(l1,l2)*a - score - q)/r + 2 truncated to int, but never less than |l1 - l2|. */ +static inline int infer_bw(int l1, int l2, int score, int a, int q, int r) +{ + int w; + if (l1 == l2 && l1 * a - score < (q + r - a)<<1) return 0; // to get equal alignment length, we need at least two gaps + w = ((double)((l1 < l2? l1 : l2) * a - score - q) / r + 2.); + if (w < abs(l1 - l2)) w = abs(l1 - l2); + return w; +} +/* Sum the lengths (cigar[k]>>4) of the CIGAR operations whose op code (low 4 bits) is 0 or 2, i.e. M and D under the MIDSH=>01234 mapping: the number of reference bases the alignment spans. Returns 0 for n_cigar == 0. */ +static inline int get_rlen(int n_cigar, const uint32_t *cigar) +{ + int k, l; + for (k = l = 0; k < n_cigar; ++k) { + int op = cigar[k]&0xf; + if (op == 0 || op == 2) + l += cigar[k]>>4; + } + return l; +} +/* Append one SAM record for list[which] (n entries in list) of read s to str, terminated by a newline. m_ is the mate alignment or 0. The alignment and the mate are copied first (ptmp/mtmp), so list and m_ are not modified. */ +void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_) +{ + int i, l_name; + mem_aln_t ptmp = list[which], *p = &ptmp, mtmp, *m = 0; // make a copy of the alignment to convert + + if (m_) mtmp = *m_, m = &mtmp; + // set flag + p->flag |= m? 0x1 : 0; // is paired in sequencing + p->flag |= p->rid < 0? 0x4 : 0; // is mapped + p->flag |= m && m->rid < 0? 0x8 : 0; // is mate mapped + if (p->rid < 0 && m && m->rid >= 0) // copy mate to alignment + p->rid = m->rid, p->pos = m->pos, p->is_rev = m->is_rev, p->n_cigar = 0; + if (m && m->rid < 0 && p->rid >= 0) // copy alignment to mate + m->rid = p->rid, m->pos = p->pos, m->is_rev = p->is_rev, m->n_cigar = 0; + p->flag |= p->is_rev? 0x10 : 0; // is on the reverse strand + p->flag |= m && m->is_rev? 0x20 : 0; // is mate on the reverse strand + + // print up to CIGAR + l_name = strlen(s->name); + ks_resize(str, str->l + s->l_seq + l_name + (s->qual? s->l_seq : 0) + 20); + kputsn(s->name, l_name, str); kputc('\t', str); // QNAME + kputw((p->flag&0xffff) | (p->flag&0x10000?
0x100 : 0), str); kputc('\t', str); // FLAG + if (p->rid >= 0) { // with coordinate + kputs(bns->anns[p->rid].name, str); kputc('\t', str); // RNAME + kputl(p->pos + 1, str); kputc('\t', str); // POS + kputw(p->mapq, str); kputc('\t', str); // MAPQ + if (p->n_cigar) { // aligned + for (i = 0; i < p->n_cigar; ++i) { + int c = p->cigar[i]&0xf; + if (!(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt && (c == 3 || c == 4)) + c = which? 4 : 3; // use hard clipping for supplementary alignments + kputw(p->cigar[i]>>4, str); kputc("MIDSH"[c], str); + } + } else kputc('*', str); // having a coordinate but unaligned (e.g. when copy_mate is true) + } else kputsn("*\t0\t0\t*", 7, str); // without coordinte + kputc('\t', str); + + // print the mate position if applicable + if (m && m->rid >= 0) { + if (p->rid == m->rid) kputc('=', str); + else kputs(bns->anns[m->rid].name, str); + kputc('\t', str); + kputl(m->pos + 1, str); kputc('\t', str); + if (p->rid == m->rid) { + int64_t p0 = p->pos + (p->is_rev? get_rlen(p->n_cigar, p->cigar) - 1 : 0); + int64_t p1 = m->pos + (m->is_rev? get_rlen(m->n_cigar, m->cigar) - 1 : 0); + if (m->n_cigar == 0 || p->n_cigar == 0) kputc('0', str); + else kputl(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? 
-1 : 0)), str); + } else kputc('0', str); + } else kputsn("*\t0\t0", 5, str); + kputc('\t', str); + + // print SEQ and QUAL + if (p->flag & 0x100) { // for secondary alignments, don't write SEQ and QUAL + kputsn("*\t*", 3, str); + } else if (!p->is_rev) { // the forward strand + int i, qb = 0, qe = s->l_seq; + if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt) { // have cigar && not the primary alignment && not softclip all + if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qb += p->cigar[0]>>4; + if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qe -= p->cigar[p->n_cigar-1]>>4; + } + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]]; + kputc('\t', str); + if (s->qual) { // printf qual + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i]; + str->s[str->l] = 0; + } else kputc('*', str); + } else { // the reverse strand + int i, qb = 0, qe = s->l_seq; + if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt) { + if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qe -= p->cigar[0]>>4; + if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qb += p->cigar[p->n_cigar-1]>>4; + } + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]]; + kputc('\t', str); + if (s->qual) { // printf qual + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i]; + str->s[str->l] = 0; + } else kputc('*', str); + } + + // print optional tags + if (p->n_cigar) { + kputsn("\tNM:i:", 6, str); kputw(p->NM, str); + kputsn("\tMD:Z:", 6, str); kputs((char*)(p->cigar + p->n_cigar), str); + } + if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } + if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } + if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); 
} + if (!(p->flag & 0x100)) { // not multi-hit + for (i = 0; i < n; ++i) + if (i != which && !(list[i].flag&0x100)) break; + if (i < n) { // there are other primary hits; output them + kputsn("\tSA:Z:", 6, str); + for (i = 0; i < n; ++i) { + const mem_aln_t *r = &list[i]; + int k; + if (i == which || (r->flag&0x100)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit + kputs(bns->anns[r->rid].name, str); kputc(',', str); + kputl(r->pos+1, str); kputc(',', str); + kputc("+-"[r->is_rev], str); kputc(',', str); + for (k = 0; k < r->n_cigar; ++k) { + kputw(r->cigar[k]>>4, str); kputc("MIDSH"[r->cigar[k]&0xf], str); + } + kputc(',', str); kputw(r->mapq, str); + kputc(',', str); kputw(r->NM, str); + kputc(';', str); + } + } + if (p->alt_sc > 0) + ksprintf(str, "\tpa:f:%.3f", (double)p->score / p->alt_sc); + } + if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); } + if (s->comment) { kputc('\t', str); kputs(s->comment, str); } + if ((opt->flag&MEM_F_REF_HDR) && p->rid >= 0 && bns->anns[p->rid].anno != 0 && bns->anns[p->rid].anno[0] != 0) { + int tmp; + kputsn("\tXR:Z:", 6, str); + tmp = str->l; + kputs(bns->anns[p->rid].anno, str); + for (i = tmp; i < str->l; ++i) // replace TAB in the comment to SPACE + if (str->s[i] == '\t') str->s[i] = ' '; + } + kputc('\n', str); +} + +/************************ + * Integrated interface * + ************************/ +/* Approximate single-end mapping quality of region a. The competing score is the larger of a->csub and a->sub (or min_seed_len * a when a->sub is 0); 0 is returned when it is not below a->score. Otherwise the estimate is reduced for a->sub_n suboptimal hits, clamped to [0,60] and finally scaled by (1 - a->frac_rep). */ +int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) +{ + int mapq, l, sub = a->sub? a->sub : opt->min_seed_len * opt->a; + double identity; + sub = a->csub > sub? a->csub : sub; + if (sub >= a->score) return 0; + l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb; + identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; + if (a->score == 0) { + mapq = 0; + } else if (opt->mapQ_coef_len > 0) { + double tmp; + tmp = l < opt->mapQ_coef_len? 1.
: opt->mapQ_coef_fac / log(l); + tmp *= identity * identity; + mapq = (int)(6.02 * (a->score - sub) / opt->a * tmp * tmp + .499); + } else { + mapq = (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499); + mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq; + } + if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499); + if (mapq > 60) mapq = 60; + if (mapq < 0) mapq = 0; + mapq = (int)(mapq * (1. - a->frac_rep) + .499); + return mapq; +} +/* Convert the regions in *a for read s into SAM text and store it in s->sam. Regions scoring below opt->T are skipped; when none is kept a single unaligned record is written. extra_flag is OR-ed into the flag of every record; m is the mate alignment handed on to mem_aln2sam() (worker2 passes 0). XA strings are generated via mem_gen_alt() unless MEM_F_ALL is set. */ +// TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible +void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) +{ + extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query); + kstring_t str; + kvec_t(mem_aln_t) aa; + int k, l; + char **XA = 0; + + if (!(opt->flag & MEM_F_ALL)) + XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq); + kv_init(aa); + str.l = str.m = 0; str.s = 0; + for (k = l = 0; k < a->n; ++k) { + mem_alnreg_t *p = &a->a[k]; + mem_aln_t *q; + if (p->score < opt->T) continue; + if (p->secondary >= 0 && (p->is_alt || !(opt->flag&MEM_F_ALL))) continue; + if (p->secondary >= 0 && p->secondary < INT_MAX && p->score < a->a[p->secondary].score * opt->drop_ratio) continue; + q = kv_pushp(mem_aln_t, aa); + *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); + assert(q->rid >= 0); // this should not happen with the new code + q->XA = XA? XA[k] : 0; + q->flag |= extra_flag; // flag secondary + if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score + if (l && p->secondary < 0) // if supplementary + q->flag |= (opt->flag&MEM_F_NO_MULTI)?
0x10000 : 0x800; + if (l && !p->is_alt && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq; + ++l; + } + if (aa.n == 0) { // no alignments good enough; then write an unaligned record + mem_aln_t t; + t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0); + t.flag |= extra_flag; + mem_aln2sam(opt, bns, &str, s, 1, &t, 0, m); + } else { + for (k = 0; k < aa.n; ++k) + mem_aln2sam(opt, bns, &str, s, aa.n, aa.a, k, m); + for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar); + free(aa.a); + } + s->sam = str.s; + if (XA) { + for (k = 0; k < a->n; ++k) free(XA[k]); + free(XA); + } +} +/* Find the alignment regions of one query. seq is rewritten in place: values below 4 are kept, everything else is mapped through nst_nt4_table. buf is handed to mem_chain() unchanged. The chains and their seed arrays are freed here; the returned vector's .a array is left for the caller to free (worker2 does so). Regions on a reference sequence flagged is_alt get is_alt = 1. */ +mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf) +{ + + int i; + mem_chain_v chn; + mem_alnreg_v regs; + + for (i = 0; i < l_seq; ++i) + // convert to 2-bit encoding if we have not done so + seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]]; + + chn = mem_chain(opt, bwt, bns, l_seq, (uint8_t*)seq, buf); + chn.n = mem_chain_flt(opt, chn.n, chn.a); + mem_flt_chained_seeds(opt, bns, pac, l_seq, (uint8_t*)seq, chn.n, chn.a); + if (bwa_verbose >= 4) mem_print_chain(bns, &chn); + + kv_init(regs); + for (i = 0; i < chn.n; ++i) { + mem_chain_t *p = &chn.a[i]; + if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i); + mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, &regs); + free(chn.a[i].seeds); + } + free(chn.a); + regs.n = mem_sort_dedup_patch(opt, bns, pac, (uint8_t*)seq, regs.n, regs.a); + if (opt->flag & MEM_F_SELF_OVLP) + regs.n = mem_test_and_remove_exact(opt, regs.n, regs.a, l_seq); + if (bwa_verbose >= 4) { + err_printf("* %ld chains remain after removing duplicated chains\n", regs.n); + for (i = 0; i < regs.n; ++i) { + mem_alnreg_t *p = &regs.a[i]; + printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re); + } + } + for (i = 0; i < regs.n; ++i) { + mem_alnreg_t *p = &regs.a[i]; + if (p->rid >= 0 && bns->anns[p->rid].is_alt) + p->is_alt = 1; + } +
return regs; +} +/* Build a mem_aln_t (position, strand, CIGAR, NM, mapq, scores) for region ar of the query query_ (l_query bases). When ar is 0 or has a negative rb/re, an unmapped record (rid = -1, pos = -1, flag 0x4) is returned. The CIGAR is regenerated with bwa_gen_cigar2(), doubling the band width for up to three rounds while the score stays more than opt->a below ar->truesc. The MD string is kept in the same buffer right after the n_cigar CIGAR words; callers free a.cigar (mem_reg2sam does). */ +mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) +{ + mem_aln_t a; + int i, w2, tmp, qb, qe, NM, score, is_rev, last_sc = -(1<<30), l_MD; + int64_t pos, rb, re; + uint8_t *query; + + memset(&a, 0, sizeof(mem_aln_t)); + if (ar == 0 || ar->rb < 0 || ar->re < 0) { // generate an unmapped record + a.rid = -1; a.pos = -1; a.flag |= 0x4; + return a; + } + qb = ar->qb, qe = ar->qe; + rb = ar->rb, re = ar->re; + query = malloc(l_query); + for (i = 0; i < l_query; ++i) // convert to the nt4 encoding + query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; + a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; + if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment + tmp = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_del, opt->e_del); + w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_ins, opt->e_ins); + w2 = w2 > tmp? w2 : tmp; + if (bwa_verbose >= 4) printf("* Band width: inferred=%d, cmd_opt=%d, alnreg=%d\n", w2, opt->w, ar->w); + if (w2 > opt->w) w2 = w2 < ar->w? w2 : ar->w; + i = 0; a.cigar = 0; + do { + free(a.cigar); + w2 = w2 < opt->w<<2? w2 : opt->w<<2; + a.cigar = bwa_gen_cigar2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); + if (bwa_verbose >= 4) printf("* Final alignment: w2=%d, global_sc=%d, local_sc=%d\n", w2, score, ar->truesc); + if (score == last_sc || w2 == opt->w<<2) break; // it is possible that global alignment and local alignment give different scores + last_sc = score; + w2 <<= 1; + } while (++i < 3 && score < ar->truesc - opt->a); + l_MD = strlen((char*)(a.cigar + a.n_cigar)) + 1; + a.NM = NM; + pos = bns_depos(bns, rb < bns->l_pac?
rb : re - 1, &is_rev); + a.is_rev = is_rev; + if (a.n_cigar > 0) { // squeeze out leading or trailing deletions + if ((a.cigar[0]&0xf) == 2) { + pos += a.cigar[0]>>4; + --a.n_cigar; + memmove(a.cigar, a.cigar + 1, a.n_cigar * 4 + l_MD); + } else if ((a.cigar[a.n_cigar-1]&0xf) == 2) { + --a.n_cigar; + memmove(a.cigar + a.n_cigar, a.cigar + a.n_cigar + 1, l_MD); // MD needs to be moved accordingly + } + } + if (qb != 0 || qe != l_query) { // add clipping to CIGAR + int clip5, clip3; + clip5 = is_rev? l_query - qe : qb; + clip3 = is_rev? qb : l_query - qe; + a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2) + l_MD); + if (clip5) { + memmove(a.cigar+1, a.cigar, a.n_cigar * 4 + l_MD); // make room for 5'-end clipping + a.cigar[0] = clip5<<4 | 3; + ++a.n_cigar; + } + if (clip3) { + memmove(a.cigar + a.n_cigar + 1, a.cigar + a.n_cigar, l_MD); // make room for 3'-end clipping + a.cigar[a.n_cigar++] = clip3<<4 | 3; + } + } + a.rid = bns_pos2rid(bns, pos); + assert(a.rid == ar->rid); + a.pos = pos - bns->anns[a.rid].offset; + a.score = ar->score; a.sub = ar->sub > ar->csub? 
ar->sub : ar->csub; + a.is_alt = ar->is_alt; a.alt_sc = ar->alt_sc; + free(query); + return a; +} +/* Shared state handed to worker1/worker2 through kt_for(): regs[i] receives the regions found for seqs[i], aux[tid] is the per-thread buffer passed to mem_align1_core(), and n_processed is added to the read index when worker2 numbers reads. */ +typedef struct { + const mem_opt_t *opt; + const bwt_t *bwt; + const bntseq_t *bns; + const uint8_t *pac; + const mem_pestat_t *pes; + smem_aux_t **aux; + bseq1_t *seqs; + mem_alnreg_v *regs; + int64_t n_processed; +} worker_t; +/* kt_for() callback, pass 1: compute alignment regions. In single-end mode i indexes one read; with MEM_F_PE it indexes a pair, i.e. reads 2i and 2i+1. */ +static void worker1(void *data, int i, int tid) +{ + //JEREMIAH + //bwa_verbose = 4; + worker_t *w = (worker_t*)data; + if (!(w->opt->flag&MEM_F_PE)) { + if (bwa_verbose >= 4) printf("=====> Processing read '%s' <=====\n", w->seqs[i].name); + w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq, w->aux[tid]); + } else { + if (bwa_verbose >= 4) printf("=====> Processing read '%s'/1 <=====\n", w->seqs[i<<1|0].name); + w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq, w->aux[tid]); + if (bwa_verbose >= 4) printf("=====> Processing read '%s'/2 <=====\n", w->seqs[i<<1|1].name); + w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq, w->aux[tid]); + } +} +/* kt_for() callback, pass 2: turn the regions into output text and free the region arrays. Single-end reads go through mem_reg2ovlp() when MEM_F_ALN_REG is set, otherwise through mem_mark_primary_se() followed by mem_reg2sam(); pairs go through mem_sam_pe(). */ +static void worker2(void *data, int i, int tid) +{ + extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]); + extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a); + worker_t *w = (worker_t*)data; + if (!(w->opt->flag&MEM_F_PE)) { + if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name); + if (w->opt->flag & MEM_F_ALN_REG) { + mem_reg2ovlp(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i]); + } else { + mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i); + mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); + } + free(w->regs[i].a); + } else { + if (bwa_verbose >= 4) printf("=====> Finalizing read pair
'%s' <=====\n", w->seqs[i<<1|0].name); + mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1]); + free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); + } +} +/* Align n reads in two kt_for() passes over opt->n_threads threads: worker1 finds the regions, worker2 generates the output; in paired-end mode the insert-size statistics are copied from pes0 or inferred with mem_pestat() in between. NOTE(review): bwa_verbose is unconditionally reset to 0 on entry, so the bwa_verbose >= 3 timing message at the end cannot fire unless the global is changed elsewhere meanwhile - confirm this is intended. NOTE(review): w.aux is sized with sizeof(smem_aux_t) although its elements are smem_aux_t* - verify. */ +void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0) +{ + + //JEREMIAH + bwa_verbose = 0; + extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n); + worker_t w; + mem_pestat_t pes[4]; + double ctime, rtime; + int i; + + ctime = cputime(); rtime = realtime(); + global_bns = bns; + w.regs = malloc(n * sizeof(mem_alnreg_v)); + w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac; + w.seqs = seqs; w.n_processed = n_processed; + w.pes = &pes[0]; + w.aux = malloc(opt->n_threads * sizeof(smem_aux_t)); + + //JEREMIAH + //fprintf(stdout, "mem_proc address 0: %x 1: %x\n", &seqs[0].seq, &seqs[1].seq); + //fprintf(stdout, "mem_proc address 0: %x 1: %x\n", &w.seqs[0].seq, &w.seqs[1].seq); + //for (i = 0; i < n; i++) + // fprintf(stdout, "w.seqs[i].seq: %s\n", w.seqs[i].seq); + + for (i = 0; i < opt->n_threads; ++i) + w.aux[i] = smem_aux_init(); + kt_for(opt->n_threads, worker1, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions + for (i = 0; i < opt->n_threads; ++i) + smem_aux_destroy(w.aux[i]); + free(w.aux); + if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided + if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0 + else mem_pestat(opt, bns->l_pac, n, w.regs, pes); // otherwise, infer the insert size distribution from data + } + kt_for(opt->n_threads, worker2, &w, (opt->flag&MEM_F_PE)?
n>>1 : n); // generate alignment + free(w.regs); + if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\n", __func__, n, cputime() - ctime, realtime() - rtime); +} diff --git a/src/bwa/bwamem.h b/src/bwa/bwamem.h new file mode 100644 index 000000000..e861f8628 --- /dev/null +++ b/src/bwa/bwamem.h @@ -0,0 +1,186 @@ +#ifndef BWAMEM_H_ +#define BWAMEM_H_ + +#include "bwt.h" +#include "bntseq.h" +#include "bwa.h" + +#define MEM_MAPQ_COEF 30.0 +#define MEM_MAPQ_MAX 60 + +struct __smem_i; +typedef struct __smem_i smem_i; + +#define MEM_F_PE 0x2 +#define MEM_F_NOPAIRING 0x4 +#define MEM_F_ALL 0x8 +#define MEM_F_NO_MULTI 0x10 +#define MEM_F_NO_RESCUE 0x20 +#define MEM_F_SELF_OVLP 0x40 +#define MEM_F_ALN_REG 0x80 +#define MEM_F_REF_HDR 0x100 +#define MEM_F_SOFTCLIP 0x200 +#define MEM_F_SMARTPE 0x400 + +typedef struct { + int a, b; // match score and mismatch penalty + int o_del, e_del; + int o_ins, e_ins; + int pen_unpaired; // phred-scaled penalty for unpaired reads + int pen_clip5,pen_clip3;// clipping penalty. This score is not deducted from the DP score. 
+ int w; // band width + int zdrop; // Z-dropoff + + uint64_t max_mem_intv; + + int T; // output score threshold; only affecting output + int flag; // see MEM_F_* macros + int min_seed_len; // minimum seed length + int min_chain_weight; + int max_chain_extend; + float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor + int split_width; // split into a seed if its occurrence is smaller than this value + int max_occ; // skip a seed if its occurrence is larger than this value + int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed + int n_threads; // number of threads + int chunk_size; // process chunk_size-bp sequences in a batch + float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits + float drop_ratio; // drop a chain if its seed coverage is below drop_ratio times the seed coverage of a better chain overlapping with the small chain + float XA_drop_ratio; // when counting hits for the XA tag, ignore alignments with score < XA_drop_ratio * max_score; only effective for the XA tag + float mask_level_redun; + float mapQ_coef_len; + int mapQ_coef_fac; + int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value + int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end + int max_XA_hits, max_XA_hits_alt; // if there are max_hits or fewer, output them all + int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset +} mem_opt_t; + +typedef struct { + int64_t rb, re; // [rb,re): reference sequence in the alignment + int qb, qe; // [qb,qe): query sequence in the alignment + int rid; // reference seq ID + int score; // best local SW score + int truesc; // actual score corresponding to the aligned region; possibly smaller than $score + int sub; // 2nd best SW score + int alt_sc; + int csub; // SW score of a tandem hit + int sub_n; // approximate number of
suboptimal hits + int w; // actual band width used in extension + int seedcov; // length of regions covered by seeds + int secondary; // index of the parent hit shadowing the current hit; <0 if primary + int secondary_all; + int seedlen0; // length of the starting seed + int n_comp:30, is_alt:2; // number of sub-alignments chained together + float frac_rep; + uint64_t hash; +} mem_alnreg_t; + +typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v; + +typedef struct { + int low, high; // lower and upper bounds within which a read pair is considered to be properly paired + int failed; // non-zero if the orientation is not supported by sufficient data + double avg, std; // mean and stddev of the insert size distribution +} mem_pestat_t; + +typedef struct { // This struct is only used for the convenience of API. + int64_t pos; // forward strand 5'-end mapping position + int rid; // reference sequence index in bntseq_t; <0 for unmapped + int flag; // extra flag + uint32_t is_rev:1, is_alt:1, mapq:8, NM:22; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance + int n_cigar; // number of CIGAR operations + uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234 + char *XA; // alternative mappings + + int score, sub, alt_sc; +} mem_aln_t; + +#ifdef __cplusplus +extern "C" { +#endif + + smem_i *smem_itr_init(const bwt_t *bwt); + void smem_itr_destroy(smem_i *itr); + void smem_set_query(smem_i *itr, int len, const uint8_t *query); + void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv); + const bwtintv_v *smem_next(smem_i *itr); + + mem_opt_t *mem_opt_init(void); + void mem_fill_scmat(int a, int b, int8_t mat[25]); + + /** + * Align a batch of sequences and generate the alignments in the SAM format + * + * This routine requires $seqs[i].{l_seq,seq,name} and write $seqs[i].sam.
+ * Note that $seqs[i].sam may consist of several SAM lines if the + * corresponding sequence has multiple primary hits. + * + * In the paired-end mode (i.e. MEM_F_PE is set in $opt->flag), query + * sequences must be interleaved: $n must be an even number and the 2i-th + * sequence and the (2i+1)-th sequence constitute a read pair. In this + * mode, there should be enough (typically >50) unique pairs for the + * routine to infer the orientation and insert size. + * + * @param opt alignment parameters + * @param bwt FM-index of the reference sequence + * @param bns Information of the reference + * @param pac 2-bit encoded reference + * @param n number of query sequences + * @param seqs query sequences; $seqs[i].seq/sam to be modified after the call + * @param pes0 insert-size info; if NULL, infer from data; if not NULL, it should be an array with 4 elements, + * corresponding to each FF, FR, RF and RR orientation. See mem_pestat() for more info. + */ + void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0); + + /** + * Find the aligned regions for one query sequence + * + * Note that this routine does not generate CIGAR. CIGAR should be + * generated later by mem_reg2aln() below. + * + * @param opt alignment parameters + * @param bwt FM-index of the reference sequence + * @param bns Information of the reference + * @param pac 2-bit encoded reference + * @param l_seq length of query sequence + * @param seq query sequence + * + * @return list of aligned regions. 
+ */ + mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq); + mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf); + /** + * Generate CIGAR and forward-strand position from alignment region + * + * @param opt alignment parameters + * @param bns Information of the reference + * @param pac 2-bit encoded reference + * @param l_seq length of query sequence + * @param seq query sequence + * @param ar one alignment region + * + * @return CIGAR, strand, mapping quality and forward-strand position + */ + mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar); + mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar, const char *name); + + /** + * Infer the insert size distribution from interleaved alignment regions + * + * This function can be called after mem_align1(), as long as paired-end + * reads are properly interleaved. 
+ * + * @param opt alignment parameters + * @param l_pac length of concatenated reference sequence + * @param n number of query sequences; must be an even number + * @param regs region array of size $n; 2i-th and (2i+1)-th elements constitute a pair + * @param pes inferred insert size distribution (output) + */ + void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/bwa/bwamem_extra.c b/src/bwa/bwamem_extra.c new file mode 100644 index 000000000..36886346b --- /dev/null +++ b/src/bwa/bwamem_extra.c @@ -0,0 +1,167 @@ +#include <limits.h> +#include "bwa.h" +#include "bwamem.h" +#include "bntseq.h" +#include "kstring.h" + +/*************************** + * SMEM iterator interface * + ***************************/ + +struct __smem_i { + const bwt_t *bwt; + const uint8_t *query; + int start, len; + int min_intv, max_len; + uint64_t max_intv; + bwtintv_v *matches; // matches; to be returned by smem_next() + bwtintv_v *sub; // sub-matches inside the longest match; temporary + bwtintv_v *tmpvec[2]; // temporary arrays +}; +/* Allocate an iterator over bwt with empty match and temporary vectors and the defaults min_intv = 1, max_len = INT_MAX (hence <limits.h>), max_intv = 0. Release it with smem_itr_destroy(). Allocation results are not checked. */ +smem_i *smem_itr_init(const bwt_t *bwt) +{ + smem_i *itr; + itr = calloc(1, sizeof(smem_i)); + itr->bwt = bwt; + itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); + itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); + itr->matches = calloc(1, sizeof(bwtintv_v)); + itr->sub = calloc(1, sizeof(bwtintv_v)); + itr->min_intv = 1; + itr->max_len = INT_MAX; + itr->max_intv = 0; + return itr; +} +/* Free the four vectors (their arrays and headers) and the iterator itself; the bwt and the query are not touched. */ +void smem_itr_destroy(smem_i *itr) +{ + free(itr->tmpvec[0]->a); free(itr->tmpvec[0]); + free(itr->tmpvec[1]->a); free(itr->tmpvec[1]); + free(itr->matches->a); free(itr->matches); + free(itr->sub->a); free(itr->sub); + free(itr); +} +/* Point the iterator at a new query of len bases and restart from position 0; the query pointer is stored, not copied. */ +void smem_set_query(smem_i *itr, int len, const uint8_t *query) +{ + itr->query = query; + itr->start = 0; + itr->len = len; +} +/* Override the iterator's min_intv, max_len and max_intv settings. */ +void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv) +{ + itr->min_intv = min_intv; +
itr->max_len = max_len; + itr->max_intv = max_intv; +} + +const bwtintv_v *smem_next(smem_i *itr) +{ + int ori_start; + itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0; + if (itr->start >= itr->len || itr->start < 0) return 0; + while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases + if (itr->start == itr->len) return 0; + ori_start = itr->start; + itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM + return itr->matches; +} + +/*********************** + *** Extra functions *** + ***********************/ + +mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq_) +{ // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence + extern mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf); + extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); + mem_alnreg_v ar; + + char *seq; + seq = malloc(l_seq); + memcpy(seq, seq_, l_seq); // makes a copy of seq_ + ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq, 0); + + mem_mark_primary_se(opt, ar.n, ar.a, lrand48()); + free(seq); + return ar; +} + +void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) +{ + int i; + kstring_t str = {0,0,0}; + for (i = 0; i < a->n; ++i) { + const mem_alnreg_t *p = &a->a[i]; + int is_rev, rid, qb = p->qb, qe = p->qe; + int64_t pos, rb = p->rb, re = p->re; + pos = bns_depos(bns, rb < bns->l_pac? 
rb : re - 1, &is_rev); + rid = bns_pos2rid(bns, pos); + assert(rid == p->rid); + pos -= bns->anns[rid].offset; + kputs(s->name, &str); kputc('\t', &str); + kputw(s->l_seq, &str); kputc('\t', &str); + if (is_rev) qb ^= qe, qe ^= qb, qb ^= qe; // swap + kputw(qb, &str); kputc('\t', &str); kputw(qe, &str); kputc('\t', &str); + kputs(bns->anns[rid].name, &str); kputc('\t', &str); + kputw(bns->anns[rid].len, &str); kputc('\t', &str); + kputw(pos, &str); kputc('\t', &str); kputw(pos + (re - rb), &str); kputc('\t', &str); + ksprintf(&str, "%.3f", (double)p->truesc / opt->a / (qe - qb > re - rb? qe - qb : re - rb)); + kputc('\n', &str); + } + s->sam = str.s; +} + +static inline int get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i) +{ + int k = a[i].secondary_all; + if (k >= 0 && a[i].score >= a[k].score * XA_drop_ratio) return k; + return -1; +} + +// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup. +char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se() +{ + int i, k, r, *cnt, tot; + kstring_t *aln = 0, str = {0,0,0}; + char **XA = 0, *has_alt; + + cnt = calloc(a->n, sizeof(int)); + has_alt = calloc(a->n, 1); + for (i = 0, tot = 0; i < a->n; ++i) { + r = get_pri_idx(opt->XA_drop_ratio, a->a, i); + if (r >= 0) { + ++cnt[r], ++tot; + if (a->a[i].is_alt) has_alt[r] = 1; + } + } + if (tot == 0) goto end_gen_alt; + aln = calloc(a->n, sizeof(kstring_t)); + for (i = 0; i < a->n; ++i) { + mem_aln_t t; + if ((r = get_pri_idx(opt->XA_drop_ratio, a->a, i)) < 0) continue; + if (cnt[r] > opt->max_XA_hits_alt || (!has_alt[r] && cnt[r] > opt->max_XA_hits)) continue; + t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]); + str.l = 0; + kputs(bns->anns[t.rid].name, &str); + kputc(',', &str); kputc("+-"[t.is_rev], &str); kputl(t.pos + 1, &str); + kputc(',', &str); + for (k = 0; k < 
t.n_cigar; ++k) { + kputw(t.cigar[k]>>4, &str); + kputc("MIDSHN"[t.cigar[k]&0xf], &str); + } + kputc(',', &str); kputw(t.NM, &str); + kputc(';', &str); + free(t.cigar); + kputsn(str.s, str.l, &aln[r]); + } + XA = calloc(a->n, sizeof(char*)); + for (k = 0; k < a->n; ++k) + XA[k] = aln[k].s; + +end_gen_alt: + free(has_alt); free(cnt); free(aln); free(str.s); + return XA; +} diff --git a/src/bwa/bwamem_pair.c b/src/bwa/bwamem_pair.c new file mode 100644 index 000000000..7950736be --- /dev/null +++ b/src/bwa/bwamem_pair.c @@ -0,0 +1,388 @@ +#include +#include +#include +#include +#include "kstring.h" +#include "bwamem.h" +#include "kvec.h" +#include "utils.h" +#include "ksw.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + + +#define MIN_RATIO 0.8 +#define MIN_DIR_CNT 10 +#define MIN_DIR_RATIO 0.05 +#define OUTLIER_BOUND 2.0 +#define MAPPING_BOUND 3.0 +#define MAX_STDDEV 4.0 + +static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist) +{ + int64_t p2; + int r1 = (b1 >= l_pac), r2 = (b2 >= l_pac); + p2 = r1 == r2? b2 : (l_pac<<1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand + *dist = p2 > b1? p2 - b1 : b1 - p2; + return (r1 == r2? 0 : 1) ^ (p2 > b1? 0 : 3); +} + +static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) +{ + int j; + for (j = 1; j < r->n; ++j) { // choose unique alignment + int b_max = r->a[j].qb > r->a[0].qb? r->a[j].qb : r->a[0].qb; + int e_min = r->a[j].qe < r->a[0].qe? r->a[j].qe : r->a[0].qe; + if (e_min > b_max) { // have overlap + int min_l = r->a[j].qe - r->a[j].qb < r->a[0].qe - r->a[0].qb? r->a[j].qe - r->a[j].qb : r->a[0].qe - r->a[0].qb; + if (e_min - b_max >= min_l * opt->mask_level) break; // significant overlap + } + } + return j < r->n? 
r->a[j].score : opt->min_seed_len * opt->a; +} + +void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]) +{ + int i, d, max; + uint64_v isize[4]; + memset(pes, 0, 4 * sizeof(mem_pestat_t)); + memset(isize, 0, sizeof(kvec_t(int)) * 4); + for (i = 0; i < n>>1; ++i) { + int dir; + int64_t is; + mem_alnreg_v *r[2]; + r[0] = (mem_alnreg_v*)®s[i<<1|0]; + r[1] = (mem_alnreg_v*)®s[i<<1|1]; + if (r[0]->n == 0 || r[1]->n == 0) continue; + if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue; + if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue; + if (r[0]->a[0].rid != r[1]->a[0].rid) continue; // not on the same chr + dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is); + if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is); + } + if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n); + for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two. 
+ mem_pestat_t *r = &pes[d]; + uint64_v *q = &isize[d]; + int p25, p50, p75, x; + if (q->n < MIN_DIR_CNT) { + fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]); + r->failed = 1; + free(q->a); + continue; + } else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]); + ks_introsort_64(q->n, q->a); + p25 = q->a[(int)(.25 * q->n + .499)]; + p50 = q->a[(int)(.50 * q->n + .499)]; + p75 = q->a[(int)(.75 * q->n + .499)]; + r->low = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); + if (r->low < 1) r->low = 1; + r->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); + fprintf(stderr, "[M::%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75); + fprintf(stderr, "[M::%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r->low, r->high); + for (i = x = 0, r->avg = 0; i < q->n; ++i) + if (q->a[i] >= r->low && q->a[i] <= r->high) + r->avg += q->a[i], ++x; + r->avg /= x; + for (i = 0, r->std = 0; i < q->n; ++i) + if (q->a[i] >= r->low && q->a[i] <= r->high) + r->std += (q->a[i] - r->avg) * (q->a[i] - r->avg); + r->std = sqrt(r->std / x); + fprintf(stderr, "[M::%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r->avg, r->std); + r->low = (int)(p25 - MAPPING_BOUND * (p75 - p25) + .499); + r->high = (int)(p75 + MAPPING_BOUND * (p75 - p25) + .499); + if (r->low > r->avg - MAX_STDDEV * r->std) r->low = (int)(r->avg - MAX_STDDEV * r->std + .499); + if (r->high < r->avg - MAX_STDDEV * r->std) r->high = (int)(r->avg + MAX_STDDEV * r->std + .499); + if (r->low < 1) r->low = 1; + fprintf(stderr, "[M::%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r->low, r->high); + free(q->a); + } + for (d = 0, max = 0; d < 4; ++d) + max = max > isize[d].n? 
max : isize[d].n; + for (d = 0; d < 4; ++d) + if (pes[d].failed == 0 && isize[d].n < max * MIN_DIR_RATIO) { + pes[d].failed = 1; + fprintf(stderr, "[M::%s] skip orientation %c%c\n", __func__, "FR"[d>>1&1], "FR"[d&1]); + } +} + +int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) +{ + extern int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a); + int64_t l_pac = bns->l_pac; + int i, r, skip[4], n = 0, rid; + for (r = 0; r < 4; ++r) + skip[r] = pes[r].failed? 1 : 0; + for (i = 0; i < ma->n; ++i) { // check which orinentation has been found + int64_t dist; + r = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist); + if (dist >= pes[r].low && dist <= pes[r].high) + skip[r] = 1; + } + if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return 0; // consistent pair exist; no need to perform SW + for (r = 0; r < 4; ++r) { + int is_rev, is_larger; + uint8_t *seq, *rev = 0, *ref = 0; + int64_t rb, re; + if (skip[r]) continue; + is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate + is_larger = !(r>>1); // whether the mate has larger coordinate + if (is_rev) { + rev = malloc(l_ms); // this is the reverse complement of $ms + for (i = 0; i < l_ms; ++i) rev[l_ms - 1 - i] = ms[i] < 4? 3 - ms[i] : 4; + seq = rev; + } else seq = (uint8_t*)ms; + if (!is_rev) { + rb = is_larger? a->rb + pes[r].low : a->rb - pes[r].high; + re = (is_larger? a->rb + pes[r].high: a->rb - pes[r].low) + l_ms; // if on the same strand, end position should be larger to make room for the seq length + } else { + rb = (is_larger? a->rb + pes[r].low : a->rb - pes[r].high) - l_ms; // similarly on opposite strands + re = is_larger? 
a->rb + pes[r].high: a->rb - pes[r].low; + } + if (rb < 0) rb = 0; + if (re > l_pac<<1) re = l_pac<<1; + if (rb < re) ref = bns_fetch_seq(bns, pac, &rb, (rb+re)>>1, &re, &rid); + if (a->rid == rid && re - rb >= opt->min_seed_len) { // no funny things happening + kswr_t aln; + mem_alnreg_t b; + int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); + aln = ksw_align2(l_ms, seq, re - rb, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0); + memset(&b, 0, sizeof(mem_alnreg_t)); + if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0 + b.rid = a->rid; + b.is_alt = a->is_alt; + b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb; + b.qe = is_rev? l_ms - aln.qb : aln.qe + 1; + b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb; + b.re = is_rev? (l_pac<<1) - (rb + aln.tb) : rb + aln.te + 1; + b.score = aln.score; + b.csub = aln.score2; + b.secondary = -1; + b.seedcov = (b.re - b.rb < b.qe - b.qb? b.re - b.rb : b.qe - b.qb) >> 1; +// printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re); + kv_push(mem_alnreg_t, *ma, b); // make room for a new element + // move b s.t. 
ma is sorted + for (i = 0; i < ma->n - 1; ++i) // find the insertion point + if (ma->a[i].score < b.score) break; + tmp = i; + for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1]; + ma->a[i] = b; + } + ++n; + } + if (n) ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a); + if (rev) free(rev); + free(ref); + } + return n; +} + +int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2], int n_pri[2]) +{ + pair64_v v, u; + int r, i, k, y[4], ret; // y[] keeps the last hit + int64_t l_pac = bns->l_pac; + kv_init(v); kv_init(u); + for (r = 0; r < 2; ++r) { // loop through read number + for (i = 0; i < n_pri[r]; ++i) { + pair64_t key; + mem_alnreg_t *e = &a[r].a[i]; + key.x = e->rb < l_pac? e->rb : (l_pac<<1) - 1 - e->rb; // forward position + key.x = (uint64_t)e->rid<<32 | (key.x - bns->anns[e->rid].offset); + key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= l_pac)<<1 | r; + kv_push(pair64_t, v, key); + } + } + ks_introsort_128(v.n, v.a); + y[0] = y[1] = y[2] = y[3] = -1; + //for (i = 0; i < v.n; ++i) printf("[%d]\t%d\t%c%ld\n", i, (int)(v.a[i].y&1)+1, "+-"[v.a[i].y>>1&1], (long)v.a[i].x); + for (i = 0; i < v.n; ++i) { + for (r = 0; r < 2; ++r) { // loop through direction + int dir = r<<1 | (v.a[i].y>>1&1), which; + if (pes[dir].failed) continue; // invalid orientation + which = r<<1 | ((v.a[i].y&1)^1); + if (y[which] < 0) continue; // no previous hits + for (k = y[which]; k >= 0; --k) { // TODO: this is a O(n^2) solution in the worst case; remember to check if this loop takes a lot of time (I doubt) + int64_t dist; + int q; + double ns; + pair64_t *p; + if ((v.a[k].y&3) != which) continue; + dist = (int64_t)v.a[i].x - v.a[k].x; + //printf("%d: %lld\n", k, dist); + if (dist > pes[dir].high) break; + if (dist < pes[dir].low) continue; + ns = (dist - pes[dir].avg) / pes[dir].std; + q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. 
* erfc(fabs(ns) * M_SQRT1_2)) * opt->a + .499); // .721 = 1/log(4) + if (q < 0) q = 0; + p = kv_pushp(pair64_t, u); + p->y = (uint64_t)k<<32 | i; + p->x = (uint64_t)q<<32 | (hash_64(p->y ^ id<<8) & 0xffffffffU); + //printf("[%lld,%lld]\t%d\tdist=%ld\n", v.a[k].x, v.a[i].x, q, (long)dist); + } + } + y[v.a[i].y&3] = i; + } + if (u.n) { // found at least one proper pair + int tmp = opt->a + opt->b; + tmp = tmp > opt->o_del + opt->e_del? tmp : opt->o_del + opt->e_del; + tmp = tmp > opt->o_ins + opt->e_ins? tmp : opt->o_ins + opt->e_ins; + ks_introsort_128(u.n, u.a); + i = u.a[u.n-1].y >> 32; k = u.a[u.n-1].y << 32 >> 32; + z[v.a[i].y&1] = v.a[i].y<<32>>34; // index of the best pair + z[v.a[k].y&1] = v.a[k].y<<32>>34; + ret = u.a[u.n-1].x >> 32; + *sub = u.n > 1? u.a[u.n-2].x>>32 : 0; + for (i = (long)u.n - 2, *n_sub = 0; i >= 0; --i) + if (*sub - (int)(u.a[i].x>>32) <= tmp) ++*n_sub; + } else ret = 0, *sub = 0, *n_sub = 0; + free(u.a); free(v.a); + return ret; +} + +void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m); + +#define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499)) + +int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) +{ + extern int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); + extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); + extern void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m); + extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query); + + int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1, n_pri[2], n_aa[2]; + kstring_t str; + mem_aln_t h[2], g[2], aa[2][2]; + + str.l = str.m = 0; str.s = 0; + memset(h, 0, 
sizeof(mem_aln_t) * 2); + memset(g, 0, sizeof(mem_aln_t) * 2); + n_aa[0] = n_aa[1] = 0; + if (!(opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment + mem_alnreg_v b[2]; + kv_init(b[0]); kv_init(b[1]); + for (i = 0; i < 2; ++i) + for (j = 0; j < a[i].n; ++j) + if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired) + kv_push(mem_alnreg_t, b[i], a[i].a[j]); + for (i = 0; i < 2; ++i) + for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) + n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); + free(b[0].a); free(b[1].a); + } + n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0); + n_pri[1] = mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1); + if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; + // pairing single-end hits + if (n_pri[0] && n_pri[1] && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z, n_pri)) > 0) { + int is_multi[2], q_pe, score_un, q_se[2]; + char **XA[2]; + // check if an end has multiple hits even after mate-SW + for (i = 0; i < 2; ++i) { + for (j = 1; j < n_pri[i]; ++j) + if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) break; + is_multi[i] = j < n_pri[i]? 1 : 0; + } + if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score + // compute mapQ for the best SE hit + score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired; + //q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0; + subo = subo > score_un? subo : score_un; + q_pe = raw_mapq(o - subo, opt->a); + if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499); + if (q_pe < 0) q_pe = 0; + if (q_pe > 60) q_pe = 60; + q_pe = (int)(q_pe * (1. 
- .5 * (a[0].a[0].frac_rep + a[1].a[0].frac_rep)) + .499); + // the following assumes no split hits + if (o > score_un) { // paired alignment is preferred + mem_alnreg_t *c[2]; + c[0] = &a[0].a[z[0]]; c[1] = &a[1].a[z[1]]; + for (i = 0; i < 2; ++i) { + if (c[i]->secondary >= 0) + c[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2; + q_se[i] = mem_approx_mapq_se(opt, c[i]); + } + q_se[0] = q_se[0] > q_pe? q_se[0] : q_pe < q_se[0] + 40? q_pe : q_se[0] + 40; + q_se[1] = q_se[1] > q_pe? q_se[1] : q_pe < q_se[1] + 40? q_pe : q_se[1] + 40; + extra_flag |= 2; + // cap at the tandem repeat score + q_se[0] = q_se[0] < raw_mapq(c[0]->score - c[0]->csub, opt->a)? q_se[0] : raw_mapq(c[0]->score - c[0]->csub, opt->a); + q_se[1] = q_se[1] < raw_mapq(c[1]->score - c[1]->csub, opt->a)? q_se[1] : raw_mapq(c[1]->score - c[1]->csub, opt->a); + } else { // the unpaired alignment is preferred + z[0] = z[1] = 0; + q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]); + q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]); + } + for (i = 0; i < 2; ++i) { + int k = a[i].a[z[i]].secondary_all; + if (k >= 0 && k < n_pri[i]) { // switch secondary and primary if both of them are non-ALT + assert(a[i].a[k].secondary_all < 0); + for (j = 0; j < a[i].n; ++j) + if (a[i].a[j].secondary_all == k || j == k) + a[i].a[j].secondary_all = z[i]; + a[i].a[z[i]].secondary_all = -1; + } + } + if (!(opt->flag & MEM_F_ALL)) { + for (i = 0; i < 2; ++i) + XA[i] = mem_gen_alt(opt, bns, pac, &a[i], s[i].l_seq, s[i].seq); + } else XA[0] = XA[1] = 0; + // write SAM + for (i = 0; i < 2; ++i) { + h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[z[i]]); + h[i].mapq = q_se[i]; + h[i].flag |= 0x40<score < opt->T || p->secondary >= 0 || !p->is_alt) continue; + g[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, p); + g[i].flag |= 0x800 | 0x40<= opt->T) which = 0; + else if (n_pri[i] < a[i].n && a[i].a[n_pri[i]].score >= opt->T) + which = n_pri[i]; + } + if (which >= 0) h[i] = mem_reg2aln(opt, bns, pac, 
s[i].l_seq, s[i].seq, &a[i].a[which]); + else h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0); + } + if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it. + int64_t dist; + int d; + d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist); + if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) extra_flag |= 2; + } + mem_reg2sam(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]); + mem_reg2sam(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]); + if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); + free(h[0].cigar); free(h[1].cigar); + return n; +} diff --git a/src/bwa/bwape.c b/src/bwa/bwape.c new file mode 100644 index 000000000..a5dc3ad4a --- /dev/null +++ b/src/bwa/bwape.c @@ -0,0 +1,783 @@ +#include +#include +#include +#include +#include +#include +#include "bwtaln.h" +#include "kvec.h" +#include "bntseq.h" +#include "utils.h" +#include "bwase.h" +#include "bwa.h" +#include "ksw.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +typedef struct { + int n; + bwtint_t *a; +} poslist_t; + +typedef struct { + double avg, std, ap_prior; + bwtint_t low, high, high_bayesian; +} isize_info_t; + +#define b128_eq(a, b) ((a).x == (b).x && (a).y == (b).y) +#define b128_hash(a) ((uint32_t)(a).x) + +#include "khash.h" +KHASH_INIT(b128, pair64_t, poslist_t, 1, b128_hash, b128_eq) + +typedef struct { + pair64_v arr; + pair64_v pos[2]; + kvec_t(bwt_aln1_t) aln[2]; +} pe_data_t; + +#define MIN_HASH_WIDTH 1000 + +extern int g_log_n[256]; // in bwase.c +static kh_b128_t *g_hash; + +void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi); +void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); +int bwa_approx_mapQ(const bwa_seq_t *p, int mm); +void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, 
const bwa_seq_t *mate, int mode, int max_top2); +bntseq_t *bwa_open_nt(const char *prefix); +void bwa_print_sam_SQ(const bntseq_t *bns); + +pe_opt_t *bwa_init_pe_opt() +{ + pe_opt_t *po; + po = (pe_opt_t*)calloc(1, sizeof(pe_opt_t)); + po->max_isize = 500; + po->force_isize = 0; + po->max_occ = 100000; + po->n_multi = 3; + po->N_multi = 10; + po->type = BWA_PET_STD; + po->is_sw = 1; + po->ap_prior = 1e-5; + return po; +} +/* +static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x); +{ + const double a = 0.140012; + double b, c; + b = log(x * (2 - x)); + c = 2./M_PI/a + b / 2.; + return sqrt(sqrt(c * c - b / a) - c); +} +*/ + +// for normal distribution, this is about 3std +#define OUTLIER_BOUND 2.0 + +static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double ap_prior, int64_t L) +{ + uint64_t x, *isizes, n_ap = 0; + int n, i, tot, p25, p75, p50, max_len = 1, tmp; + double skewness = 0.0, kurtosis = 0.0, y; + + ii->avg = ii->std = -1.0; + ii->low = ii->high = ii->high_bayesian = 0; + isizes = (uint64_t*)calloc(n_seqs, 8); + for (i = 0, tot = 0; i != n_seqs; ++i) { + bwa_seq_t *p[2]; + p[0] = seqs[0] + i; p[1] = seqs[1] + i; + if (p[0]->mapQ >= 20 && p[1]->mapQ >= 20) { + x = (p[0]->pos < p[1]->pos)? p[1]->pos + p[1]->len - p[0]->pos : p[0]->pos + p[0]->len - p[1]->pos; + if (x < 100000) isizes[tot++] = x; + } + if (p[0]->len > max_len) max_len = p[0]->len; + if (p[1]->len > max_len) max_len = p[1]->len; + } + if (tot < 20) { + fprintf(stderr, "[infer_isize] fail to infer insert size: too few good pairs\n"); + free(isizes); + return -1; + } + ks_introsort_64(tot, isizes); + p25 = isizes[(int)(tot*0.25 + 0.5)]; + p50 = isizes[(int)(tot*0.50 + 0.5)]; + p75 = isizes[(int)(tot*0.75 + 0.5)]; + tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); + ii->low = tmp > max_len? 
tmp : max_len; // ii->low is unsigned + ii->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); + if (ii->low > ii->high) { + fprintf(stderr, "[infer_isize] fail to infer insert size: upper bound is smaller than read length\n"); + free(isizes); + return -1; + } + for (i = 0, x = n = 0; i < tot; ++i) + if (isizes[i] >= ii->low && isizes[i] <= ii->high) + ++n, x += isizes[i]; + ii->avg = (double)x / n; + for (i = 0; i < tot; ++i) { + if (isizes[i] >= ii->low && isizes[i] <= ii->high) { + double tmp = (isizes[i] - ii->avg) * (isizes[i] - ii->avg); + ii->std += tmp; + skewness += tmp * (isizes[i] - ii->avg); + kurtosis += tmp * tmp; + } + } + kurtosis = kurtosis/n / (ii->std / n * ii->std / n) - 3; + ii->std = sqrt(ii->std / n); // it would be better as n-1, but n is usually very large + skewness = skewness / n / (ii->std * ii->std * ii->std); + for (y = 1.0; y < 10.0; y += 0.01) + if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break; + ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499); + for (i = 0; i < tot; ++i) + if (isizes[i] > ii->high_bayesian) ++n_ap; + ii->ap_prior = .01 * (n_ap + .01) / tot; + if (ii->ap_prior < ap_prior) ii->ap_prior = ap_prior; + free(isizes); + fprintf(stderr, "[infer_isize] (25, 50, 75) percentile: (%d, %d, %d)\n", p25, p50, p75); + if (isnan(ii->std) || p75 > 100000) { + ii->low = ii->high = ii->high_bayesian = 0; ii->avg = ii->std = -1.0; + fprintf(stderr, "[infer_isize] fail to infer insert size: weird pairing\n"); + return -1; + } + for (y = 1.0; y < 10.0; y += 0.01) + if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break; + ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499); + fprintf(stderr, "[infer_isize] low and high boundaries: %ld and %ld for estimating avg and std\n", (long)ii->low, (long)ii->high); + fprintf(stderr, "[infer_isize] inferred external isize from %d pairs: %.3lf +/- %.3lf\n", n, ii->avg, ii->std); + fprintf(stderr, "[infer_isize] skewness: %.3lf; 
kurtosis: %.3lf; ap_prior: %.2e\n", skewness, kurtosis, ii->ap_prior); + fprintf(stderr, "[infer_isize] inferred maximum insert size: %ld (%.2lf sigma)\n", (long)ii->high_bayesian, y); + return 0; +} + +static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, const isize_info_t *ii) +{ + int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len; + uint64_t o_score, subo_score; + pair64_t last_pos[2][2], o_pos[2]; + max_len = p[0]->full_len; + if (max_len < p[1]->full_len) max_len = p[1]->full_len; + if (low_bound < max_len) low_bound = max_len; + + // here v>=u. When ii is set, we check insert size with ii; otherwise with opt->max_isize +#define __pairing_aux(u,v) do { \ + bwtint_t l = (v).x + p[(v).y&1]->len - ((u).x); \ + if ((u).x != (uint64_t)-1 && (v).x > (u).x && l >= max_len \ + && ((ii->high && l <= ii->high_bayesian) || (ii->high == 0 && l <= opt->max_isize))) \ + { \ + uint64_t s = d->aln[(v).y&1].a[(v).y>>2].score + d->aln[(u).y&1].a[(u).y>>2].score; \ + s *= 10; \ + if (ii->high) s += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * fabs(l - ii->avg) / ii->std)) + .499); \ + s = s<<32 | (uint32_t)hash_64((u).x<<32 | (v).x); \ + if (s>>32 == o_score>>32) ++o_n; \ + else if (s>>32 < o_score>>32) { subo_n += o_n; o_n = 1; } \ + else ++subo_n; \ + if (s < o_score) subo_score = o_score, o_score = s, o_pos[(u).y&1] = (u), o_pos[(v).y&1] = (v); \ + else if (s < subo_score) subo_score = s; \ + } \ + } while (0) + +#define __pairing_aux2(q, w) do { \ + const bwt_aln1_t *r = d->aln[(w).y&1].a + ((w).y>>2); \ + (q)->extra_flag |= SAM_FPP; \ + if ((q)->pos != (w).x || (q)->strand != ((w).y>>1&1)) { \ + (q)->n_mm = r->n_mm; (q)->n_gapo = r->n_gapo; (q)->n_gape = r->n_gape; (q)->strand = (w).y>>1&1; \ + (q)->score = r->score; \ + (q)->pos = (w).x; \ + if ((q)->mapQ > 0) ++cnt_chg; \ + } \ + } while (0) + + o_score = subo_score = (uint64_t)-1; + o_n = subo_n = 0; + ks_introsort_128(d->arr.n, d->arr.a); + for (j = 0; j < 2; ++j) last_pos[j][0].x 
= last_pos[j][0].y = last_pos[j][1].x = last_pos[j][1].y = (uint64_t)-1; + if (opt->type == BWA_PET_STD) { + for (i = 0; i < d->arr.n; ++i) { + pair64_t x = d->arr.a[i]; + int strand = x.y>>1&1; + if (strand == 1) { // reverse strand, then check + int y = 1 - (x.y&1); + __pairing_aux(last_pos[y][1], x); + __pairing_aux(last_pos[y][0], x); + } else { // forward strand, then push + last_pos[x.y&1][0] = last_pos[x.y&1][1]; + last_pos[x.y&1][1] = x; + } + } + } else { + fprintf(stderr, "[paring] not implemented yet!\n"); + exit(1); + } + // set pairing + //fprintf(stderr, "[%ld, %d, %d, %d]\n", d->arr.n, (int)(o_score>>32), (int)(subo_score>>32), o_n); + if (o_score != (uint64_t)-1) { + int mapQ_p = 0; // this is the maximum mapping quality when one end is moved + //fprintf(stderr, "%d, %d\n", o_n, subo_n); + if (o_n == 1) { + if (subo_score == (uint64_t)-1) mapQ_p = 29; // no sub-optimal pair + else if ((subo_score>>32) - (o_score>>32) > s_mm * 10) mapQ_p = 23; // poor sub-optimal pair + else { + int n = subo_n > 255? 255 : subo_n; + mapQ_p = ((subo_score>>32) - (o_score>>32)) / 2 - g_log_n[n]; + if (mapQ_p < 0) mapQ_p = 0; + } + } + if ((p[0]->pos == o_pos[0].x && p[0]->strand == (o_pos[0].y>>1&1)) && (p[1]->pos == o_pos[1].x && p[1]->strand == (o_pos[1].y>>1&1))) { // both ends not moved + if (p[0]->mapQ > 0 && p[1]->mapQ > 0) { + int mapQ = p[0]->mapQ + p[1]->mapQ; + if (mapQ > 60) mapQ = 60; + p[0]->mapQ = p[1]->mapQ = mapQ; + } else { + if (p[0]->mapQ == 0) p[0]->mapQ = (mapQ_p + 7 < p[1]->mapQ)? mapQ_p + 7 : p[1]->mapQ; + if (p[1]->mapQ == 0) p[1]->mapQ = (mapQ_p + 7 < p[0]->mapQ)? 
mapQ_p + 7 : p[0]->mapQ; + } + } else if (p[0]->pos == o_pos[0].x && p[0]->strand == (o_pos[0].y>>1&1)) { // [1] moved + p[1]->seQ = 0; p[1]->mapQ = p[0]->mapQ; + if (p[1]->mapQ > mapQ_p) p[1]->mapQ = mapQ_p; + } else if (p[1]->pos == o_pos[1].x && p[1]->strand == (o_pos[1].y>>1&1)) { // [0] moved + p[0]->seQ = 0; p[0]->mapQ = p[1]->mapQ; + if (p[0]->mapQ > mapQ_p) p[0]->mapQ = mapQ_p; + } else { // both ends moved + p[0]->seQ = p[1]->seQ = 0; + mapQ_p -= 20; + if (mapQ_p < 0) mapQ_p = 0; + p[0]->mapQ = p[1]->mapQ = mapQ_p; + } + __pairing_aux2(p[0], o_pos[0]); + __pairing_aux2(p[1], o_pos[1]); + } + return cnt_chg; +} + +typedef struct { + kvec_t(bwt_aln1_t) aln; +} aln_buf_t; + +int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bwt, int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii, + const pe_opt_t *opt, const gap_opt_t *gopt, const isize_info_t *last_ii) +{ + int i, j, cnt_chg = 0; + char str[1024]; + bwt_t *bwt; + pe_data_t *d; + aln_buf_t *buf[2]; + + d = (pe_data_t*)calloc(1, sizeof(pe_data_t)); + buf[0] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t)); + buf[1] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t)); + + if (_bwt == 0) { // load forward SA + strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); + strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); + } else bwt = _bwt; + + // SE + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *p[2]; + for (j = 0; j < 2; ++j) { + int n_aln; + p[j] = seqs[j] + i; + p[j]->n_multi = 0; + p[j]->extra_flag |= SAM_FPD | (j == 0? 
SAM_FR1 : SAM_FR2); + err_fread_noeof(&n_aln, 4, 1, fp_sa[j]); + if (n_aln > kv_max(d->aln[j])) + kv_resize(bwt_aln1_t, d->aln[j], n_aln); + d->aln[j].n = n_aln; + err_fread_noeof(d->aln[j].a, sizeof(bwt_aln1_t), n_aln, fp_sa[j]); + kv_copy(bwt_aln1_t, buf[j][i].aln, d->aln[j]); // backup d->aln[j] + // generate SE alignment and mapping quality + bwa_aln2seq(n_aln, d->aln[j].a, p[j]); + if (p[j]->type == BWA_TYPE_UNIQUE || p[j]->type == BWA_TYPE_REPEAT) { + int strand; + int max_diff = gopt->fnr > 0.0? bwa_cal_maxdiff(p[j]->len, BWA_AVG_ERR, gopt->fnr) : gopt->max_diff; + p[j]->seQ = p[j]->mapQ = bwa_approx_mapQ(p[j], max_diff); + p[j]->pos = bwa_sa2pos(bns, bwt, p[j]->sa, p[j]->len + p[j]->ref_shift, &strand); + p[j]->strand = strand; + if (p[j]->pos == (bwtint_t)-1) p[j]->type = BWA_TYPE_NO_MATCH; + } + } + } + + // infer isize + infer_isize(n_seqs, seqs, ii, opt->ap_prior, bwt->seq_len/2); + if (ii->avg < 0.0 && last_ii->avg > 0.0) *ii = *last_ii; + if (opt->force_isize) { + fprintf(stderr, "[%s] discard insert size estimate as user's request.\n", __func__); + ii->low = ii->high = 0; ii->avg = ii->std = -1.0; + } + + // PE + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *p[2]; + for (j = 0; j < 2; ++j) { + p[j] = seqs[j] + i; + kv_copy(bwt_aln1_t, d->aln[j], buf[j][i].aln); + } + if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT) + && (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT)) + { // only when both ends mapped + pair64_t x; + int j, k; + long long n_occ[2]; + for (j = 0; j < 2; ++j) { + n_occ[j] = 0; + for (k = 0; k < d->aln[j].n; ++k) + n_occ[j] += d->aln[j].a[k].l - d->aln[j].a[k].k + 1; + } + if (n_occ[0] > opt->max_occ || n_occ[1] > opt->max_occ) continue; + d->arr.n = 0; + for (j = 0; j < 2; ++j) { + for (k = 0; k < d->aln[j].n; ++k) { + bwt_aln1_t *r = d->aln[j].a + k; + bwtint_t l; + if (0 && r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table + pair64_t key; + int ret; + key.x = r->k; key.y = r->l; + 
khint_t iter = kh_put(b128, g_hash, key, &ret); + if (ret) { // not in the hash table; ret must equal 1 as we never remove elements + poslist_t *z = &kh_val(g_hash, iter); + z->n = r->l - r->k + 1; + z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n); + for (l = r->k; l <= r->l; ++l) { + int strand; + z->a[l - r->k] = bwa_sa2pos(bns, bwt, l, p[j]->len + p[j]->ref_shift, &strand)<<1; + z->a[l - r->k] |= strand; + } + } + for (l = 0; l < kh_val(g_hash, iter).n; ++l) { + x.x = kh_val(g_hash, iter).a[l]>>1; + x.y = k<<2 | (kh_val(g_hash, iter).a[l]&1)<<1 | j; + kv_push(pair64_t, d->arr, x); + } + } else { // then calculate on the fly + for (l = r->k; l <= r->l; ++l) { + int strand; + x.x = bwa_sa2pos(bns, bwt, l, p[j]->len + p[j]->ref_shift, &strand); + x.y = k<<2 | strand<<1 | j; + kv_push(pair64_t, d->arr, x); + } + } + } + } + cnt_chg += pairing(p, d, opt, gopt->s_mm, ii); + } + + if (opt->N_multi || opt->n_multi) { + for (j = 0; j < 2; ++j) { + if (p[j]->type != BWA_TYPE_NO_MATCH) { + int k, n_multi; + if (!(p[j]->extra_flag&SAM_FPP) && p[1-j]->type != BWA_TYPE_NO_MATCH) { + bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, p[j]->c1+p[j]->c2-1 > opt->N_multi? 
opt->n_multi : opt->N_multi); + } else bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, opt->n_multi); + for (k = 0, n_multi = 0; k < p[j]->n_multi; ++k) { + int strand; + bwt_multi1_t *q = p[j]->multi + k; + q->pos = bwa_sa2pos(bns, bwt, q->pos, p[j]->len + q->ref_shift, &strand); + q->strand = strand; + if (q->pos != p[j]->pos) + p[j]->multi[n_multi++] = *q; + } + p[j]->n_multi = n_multi; + } + } + } + } + + // free + for (i = 0; i < n_seqs; ++i) { + kv_destroy(buf[0][i].aln); + kv_destroy(buf[1][i].aln); + } + free(buf[0]); free(buf[1]); + if (_bwt == 0) bwt_destroy(bwt); + kv_destroy(d->arr); + kv_destroy(d->pos[0]); kv_destroy(d->pos[1]); + kv_destroy(d->aln[0]); kv_destroy(d->aln[1]); + free(d); + return cnt_chg; +} + +#define SW_MIN_MATCH_LEN 20 +#define SW_MIN_MAPQ 17 + +// cnt = n_mm<<16 | n_gapo<<8 | n_gape +bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, int *n_cigar, uint32_t *_cnt) +{ + kswr_t r; + uint32_t *cigar32 = 0; + bwa_cigar_t *cigar = 0; + ubyte_t *ref_seq; + bwtint_t k, x, y, l; + int xtra, gscore; + int8_t mat[25]; + + bwa_fill_scmat(1, 3, mat); + // check whether there are too many N's + if (reglen < SW_MIN_MATCH_LEN || (int64_t)l_pac - *beg < len) return 0; + for (k = 0, x = 0; k < len; ++k) + if (seq[k] >= 4) ++x; + if ((float)x/len >= 0.25 || len - x < SW_MIN_MATCH_LEN) return 0; + + // get reference subsequence + ref_seq = (ubyte_t*)calloc(reglen, 1); + for (k = *beg, l = 0; l < reglen && k < l_pac; ++k) + ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; + + // do alignment + xtra = KSW_XSUBO | KSW_XSTART | (len < 250? 
KSW_XBYTE : 0); + r = ksw_align(len, (uint8_t*)seq, l, ref_seq, 5, mat, 5, 1, xtra, 0); + gscore = ksw_global(r.qe - r.qb + 1, &seq[r.qb], r.te - r.tb + 1, &ref_seq[r.tb], 5, mat, 5, 1, 50, n_cigar, &cigar32); + cigar = (bwa_cigar_t*)cigar32; + for (k = 0; k < *n_cigar; ++k) + cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); + + if (r.score < SW_MIN_MATCH_LEN || r.score2 == r.score || gscore != r.score) { // poor hit or tandem hits or weird alignment + free(cigar); free(ref_seq); *n_cigar = 0; + return 0; + } + + // check whether the alignment is good enough + for (k = 0, x = y = 0; k < *n_cigar; ++k) { + bwa_cigar_t c = cigar[k]; + if (__cigar_op(c) == FROM_M) x += __cigar_len(c), y += __cigar_len(c); + else if (__cigar_op(c) == FROM_D) x += __cigar_len(c); + else y += __cigar_len(c); + } + if (x < SW_MIN_MATCH_LEN || y < SW_MIN_MATCH_LEN) { // not good enough + free(cigar); free(ref_seq); + *n_cigar = 0; + return 0; + } + + { // update cigar and coordinate; + int start = r.qb, end = r.qe + 1; + *beg += r.tb; + cigar = (bwa_cigar_t*)realloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2)); + if (start) { + memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar)); + cigar[0] = __cigar_create(3, start); + ++(*n_cigar); + } + if (end < len) { + /*cigar[*n_cigar] = 3<<14 | (len - end);*/ + cigar[*n_cigar] = __cigar_create(3, (len - end)); + ++(*n_cigar); + } + } + + { // set *cnt + int n_mm, n_gapo, n_gape; + n_mm = n_gapo = n_gape = 0; + x = r.tb; y = r.qb; + for (k = 0; k < *n_cigar; ++k) { + bwa_cigar_t c = cigar[k]; + if (__cigar_op(c) == FROM_M) { + for (l = 0; l < (__cigar_len(c)); ++l) + if (ref_seq[x+l] < 4 && seq[y+l] < 4 && ref_seq[x+l] != seq[y+l]) ++n_mm; + x += __cigar_len(c), y += __cigar_len(c); + } else if (__cigar_op(c) == FROM_D) { + x += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1; + } else if (__cigar_op(c) == FROM_I) { + y += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1; + } + } + *_cnt = (uint32_t)n_mm<<16 | 
n_gapo<<8 | n_gape; + } + + free(ref_seq); + return cigar; +} + +ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, bwa_seq_t *seqs[2], const pe_opt_t *popt, const isize_info_t *ii) +{ + ubyte_t *pacseq; + int i; + uint64_t n_tot[2], n_mapped[2]; + + // load reference sequence + if (_pacseq == 0) { + pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); + err_rewind(bns->fp_pac); + err_fread_noeof(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); + } else pacseq = (ubyte_t*)_pacseq; + if (!popt->is_sw || ii->avg < 0.0) return pacseq; + + // perform mate alignment + n_tot[0] = n_tot[1] = n_mapped[0] = n_mapped[1] = 0; + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *p[2]; + p[0] = seqs[0] + i; p[1] = seqs[1] + i; + if ((p[0]->mapQ >= SW_MIN_MAPQ || p[1]->mapQ >= SW_MIN_MAPQ) && (p[0]->extra_flag&SAM_FPP) == 0) { // unpaired and one read has high mapQ + int k, n_cigar[2], is_singleton, mapQ = 0, mq_adjust[2]; + int64_t beg[2], end[2]; + bwa_cigar_t *cigar[2]; + uint32_t cnt[2]; + + /* In the following, _pref points to the reference read + * which must be aligned; _pmate points to its mate which is + * considered to be modified. */ + +#define __set_rght_coor(_a, _b, _pref, _pmate) do { \ + (_a) = (int64_t)_pref->pos + ii->avg - 3 * ii->std - _pmate->len * 1.5; \ + (_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \ + if ((_a) < (int64_t)_pref->pos + _pref->len) (_a) = _pref->pos + _pref->len; \ + if ((_b) > bns->l_pac) (_b) = bns->l_pac; \ + } while (0) + +#define __set_left_coor(_a, _b, _pref, _pmate) do { \ + (_a) = (int64_t)_pref->pos + _pref->len - ii->avg - 3 * ii->std - _pmate->len * 0.5; \ + (_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \ + if ((_a) < 0) (_a) = 0; \ + if ((_b) > _pref->pos) (_b) = _pref->pos; \ + } while (0) + +#define __set_fixed(_pref, _pmate, _beg, _cnt) do { \ + _pmate->type = BWA_TYPE_MATESW; \ + _pmate->pos = _beg; \ + _pmate->seQ = _pref->seQ; \ + _pmate->strand = (popt->type == BWA_PET_STD)? 
1 - _pref->strand : _pref->strand; \ + _pmate->n_mm = _cnt>>16; _pmate->n_gapo = _cnt>>8&0xff; _pmate->n_gape = _cnt&0xff; \ + _pmate->extra_flag |= SAM_FPP; \ + _pref->extra_flag |= SAM_FPP; \ + } while (0) + + mq_adjust[0] = mq_adjust[1] = 255; // not effective + is_singleton = (p[0]->type == BWA_TYPE_NO_MATCH || p[1]->type == BWA_TYPE_NO_MATCH)? 1 : 0; + + ++n_tot[is_singleton]; + cigar[0] = cigar[1] = 0; + n_cigar[0] = n_cigar[1] = 0; + if (popt->type != BWA_PET_STD) continue; // other types of pairing is not considered + for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified + ubyte_t *seq; + if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip + { // note that popt->type == BWA_PET_STD always true; in older versions, there was a branch for color-space FF/RR reads + if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate + __set_rght_coor(beg[k], end[k], p[1-k], p[k]); + seq = p[k]->rseq; + } else { // then the mate is on forward stand and has smaller coordinate + __set_left_coor(beg[k], end[k], p[1-k], p[k]); + seq = p[k]->seq; + seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly + } + } + // perform SW alignment + cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]); + if (cigar[k] && p[k]->type != BWA_TYPE_NO_MATCH) { // re-evaluate cigar[k] + int s_old, clip = 0, s_new; + if (__cigar_op(cigar[k][0]) == 3) clip += __cigar_len(cigar[k][0]); + if (__cigar_op(cigar[k][n_cigar[k]-1]) == 3) clip += __cigar_len(cigar[k][n_cigar[k]-1]); + s_old = (int)((p[k]->n_mm * 9 + p[k]->n_gapo * 13 + p[k]->n_gape * 2) / 3. * 8. + .499); + s_new = (int)(((cnt[k]>>16) * 9 + (cnt[k]>>8&0xff) * 13 + (cnt[k]&0xff) * 2 + clip * 3) / 3. * 8. 
+ .499); + s_old += -4.343 * log(ii->ap_prior / bns->l_pac); + s_new += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * 1.5) + .499)); // assume the mapped isize is 1.5\sigma + if (s_old < s_new) { // reject SW alignment + mq_adjust[k] = s_new - s_old; + free(cigar[k]); cigar[k] = 0; n_cigar[k] = 0; + } else mq_adjust[k] = s_old - s_new; + } + // now revserse sequence back such that p[*]->seq looks untouched + if (popt->type == BWA_PET_STD) { + if (p[1-k]->strand == 1) seq_reverse(p[k]->len, seq, 0); + } else { + if (p[1-k]->strand == 0) seq_reverse(p[k]->len, seq, 0); + } + } + k = -1; // no read to be changed + if (cigar[0] && cigar[1]) { + k = p[0]->mapQ < p[1]->mapQ? 0 : 1; // p[k] to be fixed + mapQ = abs(p[1]->mapQ - p[0]->mapQ); + } else if (cigar[0]) k = 0, mapQ = p[1]->mapQ; + else if (cigar[1]) k = 1, mapQ = p[0]->mapQ; + if (k >= 0 && p[k]->pos != beg[k]) { + ++n_mapped[is_singleton]; + { // recalculate mapping quality + int tmp = (int)p[1-k]->mapQ - p[k]->mapQ/2 - 8; + if (tmp <= 0) tmp = 1; + if (mapQ > tmp) mapQ = tmp; + p[k]->mapQ = p[1-k]->mapQ = mapQ; + p[k]->seQ = p[1-k]->seQ = p[1-k]->seQ < mapQ? 
p[1-k]->seQ : mapQ; + if (p[k]->mapQ > mq_adjust[k]) p[k]->mapQ = mq_adjust[k]; + if (p[k]->seQ > mq_adjust[k]) p[k]->seQ = mq_adjust[k]; + } + // update CIGAR + free(p[k]->cigar); p[k]->cigar = cigar[k]; cigar[k] = 0; + p[k]->n_cigar = n_cigar[k]; + // update the rest of information + __set_fixed(p[1-k], p[k], beg[k], cnt[k]); + } + free(cigar[0]); free(cigar[1]); + } + } + fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d singletons are mated.\n", + (long long)n_mapped[1], (long long)n_tot[1], SW_MIN_MAPQ); + fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d discordant pairs are fixed.\n", + (long long)n_mapped[0], (long long)n_tot[0], SW_MIN_MAPQ); + return pacseq; +} + +void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line) +{ + extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); + int i, j, n_seqs, tot_seqs = 0; + bwa_seq_t *seqs[2]; + bwa_seqio_t *ks[2]; + clock_t t; + bntseq_t *bns; + FILE *fp_sa[2]; + gap_opt_t opt, opt0; + khint_t iter; + isize_info_t last_ii; // this is for the last batch of reads + char str[1024], magic[2][4]; + bwt_t *bwt; + uint8_t *pac; + + // initialization + bwase_initialize(); // initialize g_log_n[] in bwase.c + pac = 0; bwt = 0; + for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); + bns = bns_restore(prefix); + srand48(bns->seed); + fp_sa[0] = xopen(fn_sa[0], "r"); + fp_sa[1] = xopen(fn_sa[1], "r"); + g_hash = kh_init(b128); + last_ii.avg = -1.0; + + err_fread_noeof(magic[0], 1, 4, fp_sa[0]); + err_fread_noeof(magic[1], 1, 4, fp_sa[1]); + if (strncmp(magic[0], SAI_MAGIC, 4) != 0 || strncmp(magic[1], SAI_MAGIC, 4) != 0) { + fprintf(stderr, "[E::%s] Unmatched SAI magic. 
Please re-run `aln' with the same version of bwa.\n", __func__); + exit(1); + } + err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[0]); + ks[0] = bwa_open_reads(opt.mode, fn_fa[0]); + opt0 = opt; + err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten! + ks[1] = bwa_open_reads(opt.mode, fn_fa[1]); + { // for Illumina alignment only + if (popt->is_preload) { + strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); + strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); + pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1); + err_rewind(bns->fp_pac); + err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac); + } + } + + // core loop + bwa_print_sam_hdr(bns, rg_line); + while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) { + int cnt_chg; + isize_info_t ii; + ubyte_t *pacseq; + + seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs, opt.mode, opt.trim_qual); + tot_seqs += n_seqs; + t = clock(); + + fprintf(stderr, "[bwa_sai2sam_pe_core] convert to sequence coordinate... \n"); + cnt_chg = bwa_cal_pac_pos_pe(bns, prefix, bwt, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii); + fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + fprintf(stderr, "[bwa_sai2sam_pe_core] changing coordinates of %d alignments.\n", cnt_chg); + + fprintf(stderr, "[bwa_sai2sam_pe_core] align unmapped mate...\n"); + pacseq = bwa_paired_sw(bns, pac, n_seqs, seqs, popt, &ii); + fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... "); + for (j = 0; j < 2; ++j) + bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + if (pac == 0) free(pacseq); + + fprintf(stderr, "[bwa_sai2sam_pe_core] print alignments... 
"); + for (i = 0; i < n_seqs; ++i) { + bwa_seq_t *p[2]; + p[0] = seqs[0] + i; p[1] = seqs[1] + i; + if (p[0]->bc[0] || p[1]->bc[0]) { + strcat(p[0]->bc, p[1]->bc); + strcpy(p[1]->bc, p[0]->bc); + } + bwa_print_sam1(bns, p[0], p[1], opt.mode, opt.max_top2); + bwa_print_sam1(bns, p[1], p[0], opt.mode, opt.max_top2); + if (strcmp(p[0]->name, p[1]->name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", p[0]->name, p[1]->name); + } + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + for (j = 0; j < 2; ++j) + bwa_free_read_seq(n_seqs, seqs[j]); + fprintf(stderr, "[bwa_sai2sam_pe_core] %d sequences have been processed.\n", tot_seqs); + last_ii = ii; + } + + // destroy + bns_destroy(bns); + for (i = 0; i < 2; ++i) { + bwa_seq_close(ks[i]); + err_fclose(fp_sa[i]); + } + for (iter = kh_begin(g_hash); iter != kh_end(g_hash); ++iter) + if (kh_exist(g_hash, iter)) free(kh_val(g_hash, iter).a); + kh_destroy(b128, g_hash); + if (pac) { + free(pac); bwt_destroy(bwt); + } +} + +int bwa_sai2sam_pe(int argc, char *argv[]) +{ + int c; + pe_opt_t *popt; + char *prefix, *rg_line = 0; + + popt = bwa_init_pe_opt(); + while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) { + switch (c) { + case 'r': + if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; + break; + case 'a': popt->max_isize = atoi(optarg); break; + case 'o': popt->max_occ = atoi(optarg); break; + case 's': popt->is_sw = 0; break; + case 'P': popt->is_preload = 1; break; + case 'n': popt->n_multi = atoi(optarg); break; + case 'N': popt->N_multi = atoi(optarg); break; + case 'c': popt->ap_prior = atof(optarg); break; + case 'f': xreopen(optarg, "w", stdout); break; + case 'A': popt->force_isize = 1; break; + default: return 1; + } + } + + if (optind + 5 > argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa sampe [options] \n\n"); + fprintf(stderr, "Options: -a INT maximum insert size [%d]\n", popt->max_isize); + fprintf(stderr, " -o INT maximum 
occurrences for one end [%d]\n", popt->max_occ); + fprintf(stderr, " -n INT maximum hits to output for paired reads [%d]\n", popt->n_multi); + fprintf(stderr, " -N INT maximum hits to output for discordant pairs [%d]\n", popt->N_multi); + fprintf(stderr, " -c FLOAT prior of chimeric rate (lower bound) [%.1le]\n", popt->ap_prior); + fprintf(stderr, " -f FILE sam file to output results to [stdout]\n"); + fprintf(stderr, " -r STR read group header line such as `@RG\\tID:foo\\tSM:bar' [null]\n"); + fprintf(stderr, " -P preload index into memory (for base-space reads only)\n"); + fprintf(stderr, " -s disable Smith-Waterman for the unmapped mate\n"); + fprintf(stderr, " -A disable insert size estimate (force -s)\n\n"); + fprintf(stderr, "Notes: 1. For SOLiD reads, corresponds R3 reads and to F3.\n"); + fprintf(stderr, " 2. For reads shorter than 30bp, applying a smaller -o is recommended to\n"); + fprintf(stderr, " to get a sensible speed at the cost of pairing accuracy.\n"); + fprintf(stderr, "\n"); + return 1; + } + if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { + fprintf(stderr, "[%s] fail to locate the index\n", __func__); + return 1; + } + bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line); + free(prefix); free(popt); + return 0; +} diff --git a/src/bwa/bwase.c b/src/bwa/bwase.c new file mode 100644 index 000000000..cb912ec50 --- /dev/null +++ b/src/bwa/bwase.c @@ -0,0 +1,602 @@ +#include +#include +#include +#include +#include +#include +#include +#include "bwase.h" +#include "bwtaln.h" +#include "bntseq.h" +#include "utils.h" +#include "kstring.h" +#include "bwa.h" +#include "ksw.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +int g_log_n[256]; + +void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi) +{ + int i, cnt, best; + if (n_aln == 0) { + s->type = BWA_TYPE_NO_MATCH; + s->c1 = s->c2 = 0; + return; + } + + if (set_main) { + best = aln[0].score; + for (i = 
cnt = 0; i < n_aln; ++i) { + const bwt_aln1_t *p = aln + i; + if (p->score > best) break; + if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) { + s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape; + s->ref_shift = (int)p->n_del - (int)p->n_ins; + s->score = p->score; + s->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48()); + } + cnt += p->l - p->k + 1; + } + s->c1 = cnt; + for (; i < n_aln; ++i) cnt += aln[i].l - aln[i].k + 1; + s->c2 = cnt - s->c1; + s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE; + } + + if (n_multi) { + int k, rest, n_occ, z = 0; + for (k = n_occ = 0; k < n_aln; ++k) { + const bwt_aln1_t *q = aln + k; + n_occ += q->l - q->k + 1; + } + if (s->multi) free(s->multi); + if (n_occ > n_multi + 1) { // if there are too many hits, generate none of them + s->multi = 0; s->n_multi = 0; + return; + } + /* The following code is more flexible than what is required + * here. In principle, due to the requirement above, we can + * simply output all hits, but the following samples "rest" + * number of random hits. */ + rest = n_occ > n_multi + 1? n_multi + 1 : n_occ; // find one additional for ->sa + s->multi = calloc(rest, sizeof(bwt_multi1_t)); + for (k = 0; k < n_aln; ++k) { + const bwt_aln1_t *q = aln + k; + if (q->l - q->k + 1 <= rest) { + bwtint_t l; + for (l = q->k; l <= q->l; ++l) { + s->multi[z].pos = l; + s->multi[z].gap = q->n_gapo + q->n_gape; + s->multi[z].ref_shift = (int)q->n_del - (int)q->n_ins; + s->multi[z++].mm = q->n_mm; + } + rest -= q->l - q->k + 1; + } else { // Random sampling (http://code.activestate.com/recipes/272884/). In fact, we never come here. 
+ int j, i; + for (j = rest, i = q->l - q->k + 1; j > 0; --j) { + double p = 1.0, x = drand48(); + while (x < p) p -= p * j / (i--); + s->multi[z].pos = q->l - i; + s->multi[z].gap = q->n_gapo + q->n_gape; + s->multi[z].ref_shift = (int)q->n_del - (int)q->n_ins; + s->multi[z++].mm = q->n_mm; + } + rest = 0; + break; + } + } + s->n_multi = z; + } +} + +void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s) +{ + bwa_aln2seq_core(n_aln, aln, s, 1, 0); +} + +int bwa_approx_mapQ(const bwa_seq_t *p, int mm) +{ + int n; + if (p->c1 == 0) return 23; + if (p->c1 > 1) return 0; + if (p->n_mm == mm) return 25; + if (p->c2 == 0) return 37; + n = (p->c2 >= 255)? 255 : p->c2; + return (23 < g_log_n[n])? 0 : 23 - g_log_n[n]; +} + +bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int ref_len, int *strand) +{ + bwtint_t pos_f; + int is_rev; + pos_f = bwt_sa(bwt, sapos); // position on the forward-reverse coordinate + if (pos_f < bns->l_pac && bns->l_pac < pos_f + ref_len) return (bwtint_t)-1; + pos_f = bns_depos(bns, pos_f, &is_rev); // position on the forward strand; this may be the first base or the last base + *strand = !is_rev; + if (is_rev) pos_f = pos_f + 1 < ref_len? 0 : pos_f - ref_len + 1; // position of the first base + return pos_f; // FIXME: it is possible that pos_f < bns->anns[ref_id].offset +} + +/** + * Derive the actual position in the read from the given suffix array + * coordinates. Note that the position will be approximate based on + * whether indels appear in the read and whether calculations are + * performed from the start or end of the read. + */ +void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t *bwt, bwa_seq_t *seq, const int max_mm, const float fnr) +{ + int max_diff, strand; + if (seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) return; + max_diff = fnr > 0.0? 
bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm; + seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); + //fprintf(stderr, "%d\n", seq->ref_shift); + seq->pos = bwa_sa2pos(bns, bwt, seq->sa, seq->len + seq->ref_shift, &strand); + seq->strand = strand; + seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); + if (seq->pos == (bwtint_t)-1) seq->type = BWA_TYPE_NO_MATCH; +} + +void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr) +{ + int i, j, strand, n_multi; + char str[1024]; + bwt_t *bwt; + // load forward SA + strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); + strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *p = &seqs[i]; + bwa_cal_pac_pos_core(bns, bwt, p, max_mm, fnr); + for (j = n_multi = 0; j < p->n_multi; ++j) { + bwt_multi1_t *q = p->multi + j; + q->pos = bwa_sa2pos(bns, bwt, q->pos, p->len + q->ref_shift, &strand); + q->strand = strand; + if (q->pos != p->pos && q->pos != (bwtint_t)-1) + p->multi[n_multi++] = *q; + } + p->n_multi = n_multi; + } + bwt_destroy(bwt); +} + +#define SW_BW 50 + +bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, int ref_shift, bwtint_t *_rb, int *n_cigar) +{ + bwa_cigar_t *cigar = 0; + uint32_t *cigar32 = 0; + ubyte_t *rseq; + int64_t k, rb, re, rlen; + int8_t mat[25]; + + bwa_fill_scmat(1, 3, mat); + rb = *_rb; re = rb + len + ref_shift; + assert(re <= l_pac); + rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); + assert(re - rb == rlen); + ksw_global(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW > abs(rlen - len) * 1.5? 
SW_BW : abs(rlen - len) * 1.5, n_cigar, &cigar32); + assert(*n_cigar > 0); + if ((cigar32[*n_cigar - 1]&0xf) == 1) cigar32[*n_cigar - 1] = (cigar32[*n_cigar - 1]>>4<<4) | 3; // change endding ins to soft clipping + if ((cigar32[0]&0xf) == 1) cigar32[0] = (cigar32[0]>>4<<4) | 3; // change beginning ins to soft clipping + if ((cigar32[*n_cigar - 1]&0xf) == 2) --*n_cigar; // delete endding del + if ((cigar32[0]&0xf) == 2) { // delete beginning del + *_rb += cigar32[0]>>4; + --*n_cigar; + memmove(cigar32, cigar32+1, (*n_cigar) * 4); + } + cigar = (bwa_cigar_t*)cigar32; + for (k = 0; k < *n_cigar; ++k) + cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); + free(rseq); + return cigar; +} + +char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_t *seq, + bwtint_t l_pac, ubyte_t *pacseq, kstring_t *str, int *_nm) +{ + bwtint_t x, y; + int z, u, c, nm = 0; + str->l = 0; // reset + x = pos; y = 0; + if (cigar) { + int k, l; + for (k = u = 0; k < n_cigar; ++k) { + l = __cigar_len(cigar[k]); + if (__cigar_op(cigar[k]) == FROM_M) { + for (z = 0; z < l && x+z < l_pac; ++z) { + c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3; + if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) { + ksprintf(str, "%d", u); + kputc("ACGTN"[c], str); + ++nm; + u = 0; + } else ++u; + } + x += l; y += l; + } else if (__cigar_op(cigar[k]) == FROM_I || __cigar_op(cigar[k]) == FROM_S) { + y += l; + if (__cigar_op(cigar[k]) == FROM_I) nm += l; + } else if (__cigar_op(cigar[k]) == FROM_D) { + ksprintf(str, "%d", u); + kputc('^', str); + for (z = 0; z < l && x+z < l_pac; ++z) + kputc("ACGT"[pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3], str); + u = 0; + x += l; nm += l; + } + } + } else { // no gaps + for (z = u = 0; z < (bwtint_t)len && x+z < l_pac; ++z) { + c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3; + if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) { + ksprintf(str, "%d", u); + kputc("ACGTN"[c], str); + ++nm; + u = 0; + } else ++u; + } + } + ksprintf(str, "%d", u); + *_nm = nm; + return 
strdup(str->s); +} + +void bwa_correct_trimmed(bwa_seq_t *s) +{ + if (s->len == s->full_len) return; + if (s->strand == 0) { // forward + if (s->cigar && __cigar_op(s->cigar[s->n_cigar-1]) == FROM_S) { // the last is S + s->cigar[s->n_cigar-1] += s->full_len - s->len; + } else { + if (s->cigar == 0) { + s->n_cigar = 2; + s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t)); + s->cigar[0] = __cigar_create(0, s->len); + } else { + ++s->n_cigar; + s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t)); + } + s->cigar[s->n_cigar-1] = __cigar_create(3, (s->full_len - s->len)); + } + } else { // reverse + if (s->cigar && __cigar_op(s->cigar[0]) == FROM_S) { // the first is S + s->cigar[0] += s->full_len - s->len; + } else { + if (s->cigar == 0) { + s->n_cigar = 2; + s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t)); + s->cigar[1] = __cigar_create(0, s->len); + } else { + ++s->n_cigar; + s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t)); + memmove(s->cigar + 1, s->cigar, (s->n_cigar-1) * sizeof(bwa_cigar_t)); + } + s->cigar[0] = __cigar_create(3, (s->full_len - s->len)); + } + } + s->len = s->full_len; +} + +void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq) +{ + ubyte_t *pacseq; + int i, j, k; + kstring_t *str; + + if (!_pacseq) { + pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); + err_rewind(bns->fp_pac); + err_fread_noeof(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); + } else pacseq = _pacseq; + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *s = seqs + i; + seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!! + for (j = k = 0; j < s->n_multi; ++j) { + bwt_multi1_t *q = s->multi + j; + int n_cigar; + if (q->gap) { // gapped alignment + q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? 
s->rseq : s->seq, q->ref_shift, &q->pos, &n_cigar); + q->n_cigar = n_cigar; + if (q->cigar) s->multi[k++] = *q; + } else s->multi[k++] = *q; + } + s->n_multi = k; // this squeezes out gapped alignments which failed the CIGAR generation + if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue; + s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, s->ref_shift, &s->pos, &s->n_cigar); + if (s->cigar == 0) s->type = BWA_TYPE_NO_MATCH; + } + // generate MD tag + str = (kstring_t*)calloc(1, sizeof(kstring_t)); + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *s = seqs + i; + if (s->type != BWA_TYPE_NO_MATCH) { + int nm; + s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq, bns->l_pac, pacseq, str, &nm); + s->nm = nm; + } + } + free(str->s); free(str); + + // correct for trimmed reads + for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i); + + if (!_pacseq) free(pacseq); +} + +int64_t pos_end(const bwa_seq_t *p) +{ + if (p->cigar) { + int j; + int64_t x = p->pos; + for (j = 0; j != p->n_cigar; ++j) { + int op = __cigar_op(p->cigar[j]); + if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]); + } + return x; + } else return p->pos + p->len; +} + +int64_t pos_end_multi(const bwt_multi1_t *p, int len) // analogy to pos_end() +{ + if (p->cigar) { + int j; + int64_t x = p->pos; + for (j = 0; j != p->n_cigar; ++j) { + int op = __cigar_op(p->cigar[j]); + if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]); + } + return x; + } else return p->pos + len; +} + +static int64_t pos_5(const bwa_seq_t *p) +{ + if (p->type != BWA_TYPE_NO_MATCH) + return p->strand? pos_end(p) : p->pos; + return -1; +} + +void bwa_print_seq(FILE *stream, bwa_seq_t *seq) { + char buffer[4096]; + const int bsz = sizeof(buffer); + int i, j, l; + + if (seq->strand == 0) { + for (i = 0; i < seq->full_len; i += bsz) { + l = seq->full_len - i > bsz ? 
/*
 * Emit one SAM record for read `p` through err_printf()/err_putchar(); the
 * sequence itself is written by bwa_print_seq(stdout, p).
 *
 *   bns      - reference annotations; bns->anns[] supplies name/offset/len
 *   p        - the read to print. NOT const: when p is unmapped but its mate
 *              is mapped, p->pos and p->strand are overwritten with the mate's
 *              values, and p->qual is reversed in place when p->strand is set.
 *   mate     - the other end of the pair, or NULL for single-end output
 *   mode     - only BWA_MODE_COMPREAD is examined here (selects "NM" vs "CM")
 *   max_top2 - X1 is printed only when p->c1 <= max_top2
 *
 * Two layouts are produced: the first branch when p or its mate has a hit,
 * the second (all reference fields '*'/0) when neither has one.
 */
void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2)
{
	int j;
	if (p->type != BWA_TYPE_NO_MATCH || (mate && mate->type != BWA_TYPE_NO_MATCH)) {
		int seqid, nn, am = 0, flag = p->extra_flag;
		char XT;

		if (p->type == BWA_TYPE_NO_MATCH) {
			// p has no hit of its own (so mate must have one to reach this branch):
			// borrow the mate's coordinate and mark p as unmapped
			p->pos = mate->pos;
			p->strand = mate->strand;
			flag |= SAM_FSU;
			j = 1;
		} else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment

		// get seqid; nn is the value returned by bns_cnt_ambi() for [pos, pos+j) and is printed as XN below
		nn = bns_cnt_ambi(bns, p->pos, j, &seqid);
		if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len)
			flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences

		// update flag and print it
		if (p->strand) flag |= SAM_FSR;
		if (mate) {
			if (mate->type != BWA_TYPE_NO_MATCH) {
				if (mate->strand) flag |= SAM_FMR;
			} else flag |= SAM_FMU;
		}
		// QNAME, FLAG, RNAME
		err_printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name);
		// POS (converted to 1-based, relative to the start of its reference sequence) and MAPQ
		err_printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ);

		// print CIGAR; without a stored CIGAR a mapped read is printed as a single <len>M run
		if (p->cigar) {
			for (j = 0; j != p->n_cigar; ++j)
				err_printf("%d%c", __cigar_len(p->cigar[j]), "MIDS"[__cigar_op(p->cigar[j])]);
		} else if (p->type == BWA_TYPE_NO_MATCH) err_printf("*");
		else err_printf("%dM", p->len);

		// print mate coordinate
		if (mate && mate->type != BWA_TYPE_NO_MATCH) {
			int m_seqid;
			long long isize;
			am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
			// redundant calculation here, but should not matter too much
			bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid);
			err_printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
			// insert size is only reported when both ends are on the same reference and p itself is mapped
			isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0;
			if (p->type == BWA_TYPE_NO_MATCH) isize = 0;
			err_printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize);
		} else if (mate) err_printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1)); // mate unmapped: it is reported at p's own coordinate
		else err_printf("\t*\t0\t0\t"); // single-end: no mate fields

		// print sequence and quality
		bwa_print_seq(stdout, p);
		err_putchar('\t');
		if (p->qual) {
			if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality (in place; p->qual stays reversed afterwards)
			err_printf("%s", p->qual);
		} else err_printf("*");

		// optional tags shared with the unmapped layout below
		if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id);
		if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc);
		if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len);
		if (p->type != BWA_TYPE_NO_MATCH) {
			int i;
			// calculate XT tag: one letter selected by p->type, forced to 'N' when nn exceeds 10
			XT = "NURM"[p->type];
			if (nn > 10) XT = 'N';
			// print tags
			err_printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? "NM" : "CM", p->nm);
			if (nn) err_printf("\tXN:i:%d", nn);
			if (mate) err_printf("\tSM:i:%d\tAM:i:%d", p->seQ, am);
			if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment
				err_printf("\tX0:i:%d", p->c1);
				if (p->c1 <= max_top2) err_printf("\tX1:i:%d", p->c2);
			}
			err_printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape);
			if (p->md) err_printf("\tMD:Z:%s", p->md);
			// print multiple hits, one "name,<strand><pos>,<cigar>,<gap+mm>;" entry per alternative
			if (p->n_multi) {
				err_printf("\tXA:Z:");
				for (i = 0; i < p->n_multi; ++i) {
					bwt_multi1_t *q = p->multi + i;
					int k;
					j = pos_end_multi(q, p->len) - q->pos;
					// seqid and nn are reused as scratch here; neither is read again after this loop
					nn = bns_cnt_ambi(bns, q->pos, j, &seqid);
					err_printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+',
							   (int)(q->pos - bns->anns[seqid].offset + 1));
					if (q->cigar) {
						for (k = 0; k < q->n_cigar; ++k)
							err_printf("%d%c", __cigar_len(q->cigar[k]), "MIDS"[__cigar_op(q->cigar[k])]);
					} else err_printf("%dM", p->len);
					err_printf(",%d;", q->gap + q->mm);
				}
			}
		}
		err_putchar('\n');
	} else { // this read has no match
		// (an earlier revision picked p->rseq or p->seq by strand and printed it directly;
		//  bwa_print_seq() is used instead)
		int flag = p->extra_flag | SAM_FSU;
		// reaching this branch with a mate means the mate is unmapped as well
		if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU;
		err_printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag);
		// NOTE(review): the original author left an open question here about why the
		// sequence used to be printed differently from the mapped branch above.
		bwa_print_seq(stdout, p);
		err_putchar('\t');
		if (p->qual) {
			if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
			err_printf("%s", p->qual);
		} else err_printf("*");
		if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id);
		if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc);
		if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len);
		err_putchar('\n');
	}
}
Please re-run `aln' with the same version of bwa.\n", __func__); + exit(1); + } + err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa); + bwa_print_sam_hdr(bns, rg_line); + // set ks + ks = bwa_open_reads(opt.mode, fn_fa); + // core loop + while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode, opt.trim_qual)) != 0) { + tot_seqs += n_seqs; + t = clock(); + + // read alignment + for (i = 0; i < n_seqs; ++i) { + bwa_seq_t *p = seqs + i; + int n_aln; + err_fread_noeof(&n_aln, 4, 1, fp_sa); + if (n_aln > m_aln) { + m_aln = n_aln; + aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln); + } + err_fread_noeof(aln, sizeof(bwt_aln1_t), n_aln, fp_sa); + bwa_aln2seq_core(n_aln, aln, p, 1, n_occ); + } + + fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... "); + bwa_cal_pac_pos(bns, prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + fprintf(stderr, "[bwa_aln_core] refine gapped alignments... "); + bwa_refine_gapped(bns, n_seqs, seqs, 0); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + fprintf(stderr, "[bwa_aln_core] print alignments... 
"); + for (i = 0; i < n_seqs; ++i) + bwa_print_sam1(bns, seqs + i, 0, opt.mode, opt.max_top2); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + + bwa_free_read_seq(n_seqs, seqs); + fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs); + } + + // destroy + bwa_seq_close(ks); + bns_destroy(bns); + err_fclose(fp_sa); + free(aln); +} + +int bwa_sai2sam_se(int argc, char *argv[]) +{ + int c, n_occ = 3; + char *prefix, *rg_line = 0; + while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) { + switch (c) { + case 'h': break; + case 'r': + if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; + break; + case 'n': n_occ = atoi(optarg); break; + case 'f': xreopen(optarg, "w", stdout); break; + default: return 1; + } + } + + if (optind + 3 > argc) { + fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] \n"); + return 1; + } + if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { + fprintf(stderr, "[%s] fail to locate the index\n", __func__); + return 1; + } + bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line); + free(prefix); + return 0; +} diff --git a/src/bwa/bwase.h b/src/bwa/bwase.h new file mode 100644 index 000000000..26a9f68c7 --- /dev/null +++ b/src/bwa/bwase.h @@ -0,0 +1,29 @@ +#ifndef BWASE_H +#define BWASE_H + +#include "bntseq.h" +#include "bwt.h" +#include "bwtaln.h" + +#ifdef __cplusplus +extern "C" { +#endif + + // Initialize mapping tables in the bwa single-end mapper. + void bwase_initialize(); + // Calculate the approximate position of the sequence from the specified bwt with loaded suffix array. + void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr); + // Refine the approximate position of the sequence to an actual placement for the sequence. 
+ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq); + // Backfill certain alignment properties mainly centering around number of matches. + void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); + // Calculate the end position of a read given a certain sequence. + int64_t pos_end(const bwa_seq_t *p); + // + bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand); + +#ifdef __cplusplus +} +#endif + +#endif // BWASE_H diff --git a/src/bwa/bwaseqio.c b/src/bwa/bwaseqio.c new file mode 100644 index 000000000..d850307cd --- /dev/null +++ b/src/bwa/bwaseqio.c @@ -0,0 +1,235 @@ +#include +#include +#include "bwtaln.h" +#include "utils.h" +#include "bamlite.h" + +#include "kseq.h" +KSEQ_DECLARE(gzFile) + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +extern unsigned char nst_nt4_table[256]; +static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; + +struct __bwa_seqio_t { + // for BAM input + int is_bam, which; // 1st bit: read1, 2nd bit: read2, 3rd: SE + bamFile fp; + // for fastq input + kseq_t *ks; +}; + +bwa_seqio_t *bwa_bam_open(const char *fn, int which) +{ + bwa_seqio_t *bs; + bam_header_t *h; + bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t)); + bs->is_bam = 1; + bs->which = which; + bs->fp = bam_open(fn, "r"); + if (0 == bs->fp) err_fatal_simple("Couldn't open bam file"); + h = bam_header_read(bs->fp); + bam_header_destroy(h); + return bs; +} + +bwa_seqio_t *bwa_seq_open(const char *fn) +{ + gzFile fp; + bwa_seqio_t *bs; + bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t)); + fp = xzopen(fn, "r"); + bs->ks = kseq_init(fp); + return bs; +} + +void bwa_seq_close(bwa_seqio_t *bs) +{ + if (bs == 0) return; + if (bs->is_bam) { + if (0 != bam_close(bs->fp)) err_fatal_simple("Error closing bam file"); + } else { + err_gzclose(bs->ks->f->f); + kseq_destroy(bs->ks); + } + free(bs); +} + +void seq_reverse(int len, ubyte_t *seq, int 
is_comp) +{ + int i; + if (is_comp) { + for (i = 0; i < len>>1; ++i) { + char tmp = seq[len-1-i]; + if (tmp < 4) tmp = 3 - tmp; + seq[len-1-i] = (seq[i] >= 4)? seq[i] : 3 - seq[i]; + seq[i] = tmp; + } + if (len&1) seq[i] = (seq[i] >= 4)? seq[i] : 3 - seq[i]; + } else { + for (i = 0; i < len>>1; ++i) { + char tmp = seq[len-1-i]; + seq[len-1-i] = seq[i]; seq[i] = tmp; + } + } +} + +int bwa_trim_read(int trim_qual, bwa_seq_t *p) +{ + int s = 0, l, max = 0, max_l = p->len; + if (trim_qual < 1 || p->qual == 0) return 0; + for (l = p->len - 1; l >= BWA_MIN_RDLEN; --l) { + s += trim_qual - (p->qual[l] - 33); + if (s < 0) break; + if (s > max) max = s, max_l = l; + } + p->clip_len = p->len = max_l; + return p->full_len - p->len; +} + +static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual) +{ + bwa_seq_t *seqs, *p; + int n_seqs, l, i; + long n_trimmed = 0, n_tot = 0; + bam1_t *b; + int res; + + b = bam_init1(); + n_seqs = 0; + seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); + while ((res = bam_read1(bs->fp, b)) >= 0) { + uint8_t *s, *q; + int go = 0; + if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1; + if ((bs->which & 2) && (b->core.flag & BAM_FREAD2)) go = 1; + if ((bs->which & 4) && !(b->core.flag& BAM_FREAD1) && !(b->core.flag& BAM_FREAD2))go = 1; + if (go == 0) continue; + l = b->core.l_qseq; + p = &seqs[n_seqs++]; + p->tid = -1; // no assigned to a thread + p->qual = 0; + p->full_len = p->clip_len = p->len = l; + n_tot += p->full_len; + s = bam1_seq(b); q = bam1_qual(b); + p->seq = (ubyte_t*)calloc(p->len + 1, 1); + p->qual = (ubyte_t*)calloc(p->len + 1, 1); + for (i = 0; i != p->full_len; ++i) { + p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)]; + p->qual[i] = q[i] + 33 < 126? 
q[i] + 33 : 126; + } + if (bam1_strand(b)) { // then reverse + seq_reverse(p->len, p->seq, 1); + seq_reverse(p->len, p->qual, 0); + } + if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p); + p->rseq = (ubyte_t*)calloc(p->full_len, 1); + memcpy(p->rseq, p->seq, p->len); + seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() + seq_reverse(p->len, p->rseq, is_comp); + p->name = strdup((const char*)bam1_qname(b)); + if (n_seqs == n_needed) break; + } + if (res < 0 && res != -1) err_fatal_simple("Error reading bam file"); + *n = n_seqs; + if (n_seqs && trim_qual >= 1) + fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot); + if (n_seqs == 0) { + free(seqs); + bam_destroy1(b); + return 0; + } + bam_destroy1(b); + return seqs; +} + +#define BARCODE_LOW_QUAL 13 + +bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual) +{ + bwa_seq_t *seqs, *p; + kseq_t *seq = bs->ks; + int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24; + long n_trimmed = 0, n_tot = 0; + + if (l_bc > BWA_MAX_BCLEN) { + fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN); + return 0; + } + if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input + n_seqs = 0; + seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); + while ((l = kseq_read(seq)) >= 0) { + if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) { + // skip reads that are marked to be filtered by Casava + char *s = index(seq->comment.s, ':'); + if (s && *(++s) == 'Y') { + continue; + } + } + if (is_64 && seq->qual.l) + for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31; + if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length + p = &seqs[n_seqs++]; + if (l_bc) { // then trim barcode + for (i = 0; i < l_bc; ++i) + p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < 
BARCODE_LOW_QUAL)? tolower(seq->seq.s[i]) : toupper(seq->seq.s[i]); + p->bc[i] = 0; + for (; i < seq->seq.l; ++i) + seq->seq.s[i - l_bc] = seq->seq.s[i]; + seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0; + if (seq->qual.l) { + for (i = l_bc; i < seq->qual.l; ++i) + seq->qual.s[i - l_bc] = seq->qual.s[i]; + seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0; + } + l = seq->seq.l; + } else p->bc[0] = 0; + p->tid = -1; // no assigned to a thread + p->qual = 0; + p->full_len = p->clip_len = p->len = l; + n_tot += p->full_len; + p->seq = (ubyte_t*)calloc(p->full_len, 1); + for (i = 0; i != p->full_len; ++i) + p->seq[i] = nst_nt4_table[(int)seq->seq.s[i]]; + if (seq->qual.l) { // copy quality + p->qual = (ubyte_t*)strdup((char*)seq->qual.s); + if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p); + } + p->rseq = (ubyte_t*)calloc(p->full_len, 1); + memcpy(p->rseq, p->seq, p->len); + seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() + seq_reverse(p->len, p->rseq, is_comp); + p->name = strdup((const char*)seq->name.s); + { // trim /[12]$ + int t = strlen(p->name); + if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0'; + } + if (n_seqs == n_needed) break; + } + *n = n_seqs; + if (n_seqs && trim_qual >= 1) + fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot); + if (n_seqs == 0) { + free(seqs); + return 0; + } + return seqs; +} + +void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs) +{ + int i, j; + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *p = seqs + i; + for (j = 0; j < p->n_multi; ++j) + if (p->multi[j].cigar) free(p->multi[j].cigar); + free(p->name); + free(p->seq); free(p->rseq); free(p->qual); free(p->aln); free(p->md); free(p->multi); + free(p->cigar); + } + free(seqs); +} diff --git a/src/bwa/bwashm.c b/src/bwa/bwashm.c new file mode 100644 index 000000000..163f76456 --- /dev/null +++ b/src/bwa/bwashm.c @@ -0,0 +1,213 @@ +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include "bwa.h" + +int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn) +{ + const char *name; + uint8_t *shm, *shm_idx; + uint16_t *cnt; + int shmid, to_init = 0, l; + char path[PATH_MAX + 1], *tmpfn = (char*)_tmpfn; + + if (hint == 0 || hint[0] == 0) return -1; + for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name); + ++name; + + if ((shmid = shm_open("/bwactl", O_RDWR, 0)) < 0) { + shmid = shm_open("/bwactl", O_CREAT|O_RDWR|O_EXCL, 0644); + to_init = 1; + } + if (shmid < 0) return -1; + ftruncate(shmid, BWA_CTL_SIZE); + shm = mmap(0, BWA_CTL_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0); + cnt = (uint16_t*)shm; + if (to_init) { + memset(shm, 0, BWA_CTL_SIZE); + cnt[1] = 4; + } + + if (idx->mem == 0) bwa_idx2mem(idx); + + if (tmpfn) { + FILE *fp; + if ((fp = fopen(tmpfn, "wb")) != 0) { + int64_t rest = idx->l_mem; + while (rest > 0) { + int64_t l = rest < 0x1000000? rest : 0x1000000; + rest -= fwrite(&idx->mem[idx->l_mem - rest], 1, l, fp); + } + fclose(fp); + free(idx->mem); idx->mem = 0; + } else { + fprintf(stderr, "[W::%s] fail to create the temporary file. Option '-f' is ignored.\n", __func__); + tmpfn = 0; + } + } + + strcat(strcpy(path, "/bwaidx-"), name); + if ((shmid = shm_open(path, O_CREAT|O_RDWR|O_EXCL, 0644)) < 0) { + shm_unlink(path); + perror("shm_open()"); + return -1; + } + l = 8 + strlen(name) + 1; + if (cnt[1] + l > BWA_CTL_SIZE) return -1; + memcpy(shm + cnt[1], &idx->l_mem, 8); + memcpy(shm + cnt[1] + 8, name, l - 8); + cnt[1] += l; ++cnt[0]; + ftruncate(shmid, idx->l_mem); + shm_idx = mmap(0, idx->l_mem, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0); + if (tmpfn) { + FILE *fp; + fp = fopen(tmpfn, "rb"); + int64_t rest = idx->l_mem; + while (rest > 0) { + int64_t l = rest < 0x1000000? 
rest : 0x1000000; + rest -= fread(&shm_idx[idx->l_mem - rest], 1, l, fp); + } + fclose(fp); + unlink(tmpfn); + } else { + memcpy(shm_idx, idx->mem, idx->l_mem); + free(idx->mem); + } + bwa_mem2idx(idx->l_mem, shm_idx, idx); + idx->is_shm = 1; + return 0; +} + +bwaidx_t *bwa_idx_load_from_shm(const char *hint) +{ + const char *name; + uint8_t *shm, *shm_idx; + uint16_t *cnt, i; + char *p, path[PATH_MAX + 1]; + int shmid; + int64_t l_mem; + bwaidx_t *idx; + + if (hint == 0 || hint[0] == 0) return 0; + for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name); + ++name; + if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return 0; + shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0); + cnt = (uint16_t*)shm; + if (cnt[0] == 0) return 0; + for (i = 0, p = (char*)(shm + 4); i < cnt[0]; ++i) { + memcpy(&l_mem, p, 8); p += 8; + if (strcmp(p, name) == 0) break; + p += strlen(p) + 1; + } + if (i == cnt[0]) return 0; + + strcat(strcpy(path, "/bwaidx-"), name); + if ((shmid = shm_open(path, O_RDONLY, 0)) < 0) return 0; + shm_idx = mmap(0, l_mem, PROT_READ, MAP_SHARED, shmid, 0); + idx = calloc(1, sizeof(bwaidx_t)); + bwa_mem2idx(l_mem, shm_idx, idx); + idx->is_shm = 1; + return idx; +} + +int bwa_shm_test(const char *hint) +{ + int shmid; + uint16_t *cnt, i; + char *p, *shm; + const char *name; + + if (hint == 0 || hint[0] == 0) return 0; + for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name); + ++name; + if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return 0; + shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0); + cnt = (uint16_t*)shm; + for (i = 0, p = shm + 4; i < cnt[0]; ++i) { + if (strcmp(p + 8, name) == 0) return 1; + p += strlen(p) + 9; + } + return 0; +} + +int bwa_shm_list(void) +{ + int shmid; + uint16_t *cnt, i; + char *p, *shm; + if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return -1; + shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0); + cnt = (uint16_t*)shm; + for (i = 0, p = 
shm + 4; i < cnt[0]; ++i) { + int64_t l_mem; + memcpy(&l_mem, p, 8); p += 8; + printf("%s\t%ld\n", p, (long)l_mem); + p += strlen(p) + 1; + } + return 0; +} + +int bwa_shm_destroy(void) +{ + int shmid; + uint16_t *cnt, i; + char *p, *shm; + char path[PATH_MAX + 1]; + + if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return -1; + shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0); + cnt = (uint16_t*)shm; + for (i = 0, p = shm + 4; i < cnt[0]; ++i) { + int64_t l_mem; + memcpy(&l_mem, p, 8); p += 8; + strcat(strcpy(path, "/bwaidx-"), p); + shm_unlink(path); + p += strlen(p) + 1; + } + munmap(shm, BWA_CTL_SIZE); + shm_unlink("/bwactl"); + return 0; +} + +int main_shm(int argc, char *argv[]) +{ + int c, to_list = 0, to_drop = 0, ret = 0; + char *tmpfn = 0; + while ((c = getopt(argc, argv, "ldf:")) >= 0) { + if (c == 'l') to_list = 1; + else if (c == 'd') to_drop = 1; + else if (c == 'f') tmpfn = optarg; + } + if (optind == argc && !to_list && !to_drop) { + fprintf(stderr, "\nUsage: bwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n"); + fprintf(stderr, "Options: -d destroy all indices in shared memory\n"); + fprintf(stderr, " -l list names of indices in shared memory\n"); + fprintf(stderr, " -f FILE temporary file to reduce peak memory\n\n"); + return 1; + } + if (optind < argc && (to_list || to_drop)) { + fprintf(stderr, "[E::%s] open -l or -d cannot be used when 'idxbase' is present\n", __func__); + return 1; + } + if (optind < argc) { + if (bwa_shm_test(argv[optind]) == 0) { + bwaidx_t *idx; + idx = bwa_idx_load_from_disk(argv[optind], BWA_IDX_ALL); + if (bwa_shm_stage(idx, argv[optind], tmpfn) < 0) { + fprintf(stderr, "[E::%s] failed to stage the index in shared memory\n", __func__); + ret = 1; + } + bwa_idx_destroy(idx); + } else fprintf(stderr, "[M::%s] index '%s' is already in shared memory\n", __func__, argv[optind]); + } + if (to_list) bwa_shm_list(); + if (to_drop) bwa_shm_destroy(); + return ret; +} diff --git a/src/bwa/bwt.c b/src/bwa/bwt.c new file 
mode 100644 index 000000000..e5b4da8c1 --- /dev/null +++ b/src/bwa/bwt.c @@ -0,0 +1,470 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#include +#include +#include +#include +#include +#include +#include "utils.h" +#include "bwt.h" +#include "kvec.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +void bwt_gen_cnt_table(bwt_t *bwt) +{ + int i, j; + for (i = 0; i != 256; ++i) { + uint32_t x = 0; + for (j = 0; j != 4; ++j) + x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3); + bwt->cnt_table[i] = x; + } +} + +static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inverse CSA +{ + bwtint_t x = k - (k > bwt->primary); + x = bwt_B0(bwt, x); + x = bwt->L2[x] + bwt_occ(bwt, k, x); + return k == bwt->primary? 
/*
 * Count how many of the 32 two-bit cells packed in y are equal to symbol c
 * (only the low two bits of c are examined).
 */
static inline int __occ_aux(uint64_t y, int c)
{
	// Per cell, pick y or ~y so that a set bit means "this bit agrees with c".
	const uint64_t hi_ok = (c & 2)? y : ~y; // judged on the upper bit of each cell
	const uint64_t lo_ok = (c & 1)? y : ~y; // judged on the lower bit of each cell

	// A cell matches when both of its bits agree; the result is one flag per
	// cell, parked on the cell's lower bit.
	const uint64_t hits = (hi_ok >> 1) & lo_ok & 0x5555555555555555ull;

	// Population count of the flags: nibble sums, byte sums, then a multiply
	// that gathers all byte sums into the top byte.
	const uint64_t nibbles = (hits & 0x3333333333333333ull) + ((hits >> 2) & 0x3333333333333333ull);
	const uint64_t bytes = (nibbles + (nibbles >> 4)) & 0x0f0f0f0f0f0f0f0full;
	return (int)((bytes * 0x0101010101010101ull) >> 56);
}
/*
 * Compute, for all four symbols at once, the occurrence counts at BWT
 * position k and store them in cnt[0..3].
 *
 * k == (bwtint_t)-1 yields all zeros.  Otherwise the counts are the four
 * counters stored at the start of k's occurrence block plus the cells counted
 * from the packed words that follow, up to and including cell k.
 */
void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
{
	bwtint_t x;
	uint32_t *p, tmp, *end;
	if (k == (bwtint_t)(-1)) {
		memset(cnt, 0, 4 * sizeof(bwtint_t));
		return;
	}

	k -= (k >= bwt->primary); // because $ is not in bwt
	p = bwt_occ_intv(bwt, k); // start of the block that contains k
	memcpy(cnt, p, 4 * sizeof(bwtint_t)); // the block begins with four stored counters
	// p is a uint32_t pointer, so this steps over 4*sizeof(bwtint_t) bytes: exactly the counters just copied
	p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))
	// each 32-bit word packs 16 two-bit cells, hence the >>4
	end = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4)); // this is the end point of the following loop
	// x packs four running counts, one per byte lane, as produced by cnt_table
	for (x = 0; p < end; ++p) x += __occ_aux4(bwt, *p);
	// last, partial word: clear the low (~k&15) cells so that only cells up to k remain
	tmp = *p & ~((1U<<((~k&15)<<1)) - 1);
	// the cleared cells read as symbol 0 and were counted in the lowest lane; take them back out
	x += __occ_aux4(bwt, tmp) - (~k&15);
	// unpack the four byte lanes into the per-symbol totals
	cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
}
bwt->primary); + _l = l - (l >= bwt->primary); + if (_l>>OCC_INTV_SHIFT != _k>>OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { + bwt_occ4(bwt, k, cntk); + bwt_occ4(bwt, l, cntl); + } else { + bwtint_t x, y; + uint32_t *p, tmp, *endk, *endl; + k -= (k >= bwt->primary); // because $ is not in bwt + l -= (l >= bwt->primary); + p = bwt_occ_intv(bwt, k); + memcpy(cntk, p, 4 * sizeof(bwtint_t)); + p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t)) + // prepare cntk[] + endk = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4)); + endl = p + ((l>>4) - ((l&~OCC_INTV_MASK)>>4)); + for (x = 0; p < endk; ++p) x += __occ_aux4(bwt, *p); + y = x; + tmp = *p & ~((1U<<((~k&15)<<1)) - 1); + x += __occ_aux4(bwt, tmp) - (~k&15); + // calculate cntl[] and finalize cntk[] + for (; p < endl; ++p) y += __occ_aux4(bwt, *p); + tmp = *p & ~((1U<<((~l&15)<<1)) - 1); + y += __occ_aux4(bwt, tmp) - (~l&15); + memcpy(cntl, cntk, 4 * sizeof(bwtint_t)); + cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24; + cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24; + } +} + +int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end) +{ + bwtint_t k, l, ok, ol; + int i; + k = 0; l = bwt->seq_len; + for (i = len - 1; i >= 0; --i) { + ubyte_t c = str[i]; + if (c > 3) return 0; // no match + bwt_2occ(bwt, k - 1, l, c, &ok, &ol); + k = bwt->L2[c] + ok + 1; + l = bwt->L2[c] + ol; + if (k > l) break; // no match + } + if (k > l) return 0; // no match + if (sa_begin) *sa_begin = k; + if (sa_end) *sa_end = l; + return l - k + 1; +} + +int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0) +{ + int i; + bwtint_t k, l, ok, ol; + k = *k0; l = *l0; + for (i = len - 1; i >= 0; --i) { + ubyte_t c = str[i]; + if (c > 3) return 0; // there is an N here. 
no match + bwt_2occ(bwt, k - 1, l, c, &ok, &ol); + k = bwt->L2[c] + ok + 1; + l = bwt->L2[c] + ol; + if (k > l) return 0; // no match + } + *k0 = k; *l0 = l; + return l - k + 1; +} + +/********************* + * Bidirectional BWT * + *********************/ + +void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back) +{ + bwtint_t tk[4], tl[4]; + int i; + bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); + for (i = 0; i != 4; ++i) { + ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i]; + ok[i].x[2] = tl[i] - tk[i]; + } + ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary); + ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2]; + ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2]; + ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2]; +} + +static void bwt_reverse_intvs(bwtintv_v *p) +{ + if (p->n > 1) { + int j; + for (j = 0; j < p->n>>1; ++j) { + bwtintv_t tmp = p->a[p->n - 1 - j]; + p->a[p->n - 1 - j] = p->a[j]; + p->a[j] = tmp; + } + } +} +// NOTE: $max_intv is not currently used in BWA-MEM +int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) +{ + int i, j, c, ret; + bwtintv_t ik, ok[4]; + bwtintv_v a[2], *prev, *curr, *swap; + + mem->n = 0; + if (q[x] > 3) return x + 1; + if (min_intv < 1) min_intv = 1; // the interval size should be at least 1 + kv_init(a[0]); kv_init(a[1]); + prev = tmpvec && tmpvec[0]? tmpvec[0] : &a[0]; // use the temporary vector if provided + curr = tmpvec && tmpvec[1]? 
tmpvec[1] : &a[1]; + bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base + ik.info = x + 1; + + for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search + if (ik.x[2] < max_intv) { // an interval small enough + kv_push(bwtintv_t, *curr, ik); + break; + } else if (q[i] < 4) { // an A/C/G/T base + c = 3 - q[i]; // complement of q[i] + bwt_extend(bwt, &ik, ok, 0); + if (ok[c].x[2] != ik.x[2]) { // change of the interval size + kv_push(bwtintv_t, *curr, ik); + if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further + } + ik = ok[c]; ik.info = i + 1; + } else { // an ambiguous base + kv_push(bwtintv_t, *curr, ik); + break; // always terminate extension at an ambiguous base; in this case, ia[0].info; // this will be the returned value + swap = curr; curr = prev; prev = swap; + + for (i = x - 1; i >= -1; --i) { // backward search for MEMs + c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base + for (j = 0, curr->n = 0; j < prev->n; ++j) { + bwtintv_t *p = &prev->a[j]; + if (c >= 0 && ik.x[2] >= max_intv) bwt_extend(bwt, p, ok, 1); + if (c < 0 || ik.x[2] < max_intv || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough + if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches + if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches + ik = *p; ik.info |= (uint64_t)(i + 1)<<32; + kv_push(bwtintv_t, *mem, ik); + } + } // otherwise the match is contained in another longer match + } else if (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2]) { + ok[c].info = p->info; + kv_push(bwtintv_t, *curr, ok[c]); + } + } + if (curr->n == 0) break; + swap = curr; curr = prev; prev = swap; + } + bwt_reverse_intvs(mem); // s.t. 
sorted by the start coordinate + + if (tmpvec == 0 || tmpvec[0] == 0) free(a[0].a); + if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a); + return ret; +} + +int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) +{ + return bwt_smem1a(bwt, len, q, x, min_intv, 0, mem, tmpvec); +} + +int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem) +{ + int i, c; + bwtintv_t ik, ok[4]; + + memset(mem, 0, sizeof(bwtintv_t)); + if (q[x] > 3) return x + 1; + bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base + for (i = x + 1; i < len; ++i) { // forward search + if (q[i] < 4) { // an A/C/G/T base + c = 3 - q[i]; // complement of q[i] + bwt_extend(bwt, &ik, ok, 0); + if (ok[c].x[2] < max_intv && i - x >= min_len) { + *mem = ok[c]; + mem->info = (uint64_t)x<<32 | (i + 1); + return i + 1; + } + ik = ok[c]; + } else return i + 1; + } + return len; +} + +/************************* + * Read/write BWT and SA * + *************************/ + +void bwt_dump_bwt(const char *fn, const bwt_t *bwt) +{ + FILE *fp; + fp = xopen(fn, "wb"); + err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); + err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); + err_fwrite(bwt->bwt, 4, bwt->bwt_size, fp); + err_fflush(fp); + err_fclose(fp); +} + +void bwt_dump_sa(const char *fn, const bwt_t *bwt) +{ + FILE *fp; + fp = xopen(fn, "wb"); + err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); + err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); + err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); + err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp); + err_fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); + err_fflush(fp); + err_fclose(fp); +} + +static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a) +{ // Mac/Darwin has a bug when reading data longer than 2GB. 
This function fixes this issue by reading data in small chunks + const int bufsize = 0x1000000; // 16M block + bwtint_t offset = 0; + while (size) { + int x = bufsize < size? bufsize : size; + if ((x = err_fread_noeof(a + offset, 1, x, fp)) == 0) break; + size -= x; offset += x; + } + return offset; +} + +void bwt_restore_sa(const char *fn, bwt_t *bwt) +{ + char skipped[256]; + FILE *fp; + bwtint_t primary; + + fp = xopen(fn, "rb"); + err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); + xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same."); + err_fread_noeof(skipped, sizeof(bwtint_t), 4, fp); // skip + err_fread_noeof(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); + err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); + xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same."); + + bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv; + bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); + bwt->sa[0] = -1; + + fread_fix(fp, sizeof(bwtint_t) * (bwt->n_sa - 1), bwt->sa + 1); + err_fclose(fp); +} + +bwt_t *bwt_restore_bwt(const char *fn) +{ + bwt_t *bwt; + FILE *fp; + + bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); + fp = xopen(fn, "rb"); + err_fseek(fp, 0, SEEK_END); + bwt->bwt_size = (err_ftell(fp) - sizeof(bwtint_t) * 5) >> 2; + bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4); + err_fseek(fp, 0, SEEK_SET); + err_fread_noeof(&bwt->primary, sizeof(bwtint_t), 1, fp); + err_fread_noeof(bwt->L2+1, sizeof(bwtint_t), 4, fp); + fread_fix(fp, bwt->bwt_size<<2, bwt->bwt); + bwt->seq_len = bwt->L2[4]; + err_fclose(fp); + bwt_gen_cnt_table(bwt); + + return bwt; +} + +void bwt_destroy(bwt_t *bwt) +{ + if (bwt == 0) return; + free(bwt->sa); free(bwt->bwt); + free(bwt); +} diff --git a/src/bwa/bwt.h b/src/bwa/bwt.h new file mode 100644 index 000000000..c71d6b5e1 --- /dev/null +++ b/src/bwa/bwt.h @@ -0,0 +1,130 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#ifndef BWA_BWT_H +#define BWA_BWT_H + +#include +#include + +// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line because some part of the code assume OCC_INTERVAL=0x80 +#define OCC_INTV_SHIFT 7 +#define OCC_INTERVAL (1LL<bwt[(k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) + sizeof(bwtint_t)/4*4 + (k)%OCC_INTERVAL/16]) +#define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) +*/ + +// The following two lines are ONLY correct when OCC_INTERVAL==0x80 +#define bwt_bwt(b, k) ((b)->bwt[((k)>>7<<4) + sizeof(bwtint_t) + (((k)&0x7f)>>4)]) +#define bwt_occ_intv(b, k) ((b)->bwt + ((k)>>7<<4)) + +/* retrieve a character from the $-removed BWT string. 
Note that + * bwt_t::bwt is not exactly the BWT string and therefore this macro is + * called bwt_B0 instead of bwt_B */ +#define bwt_B0(b, k) (bwt_bwt(b, k)>>((~(k)&0xf)<<1)&3) + +#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)]+1, (ik).x[2] = (bwt)->L2[(int)(c)+1]-(bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)]+1, (ik).info = 0) + +#ifdef __cplusplus +extern "C" { +#endif + + void bwt_dump_bwt(const char *fn, const bwt_t *bwt); + void bwt_dump_sa(const char *fn, const bwt_t *bwt); + + bwt_t *bwt_restore_bwt(const char *fn); + void bwt_restore_sa(const char *fn, bwt_t *bwt); + + void bwt_destroy(bwt_t *bwt); + + void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW + void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size); // from BWT-SW + void bwt_cal_sa(bwt_t *bwt, int intv); + + void bwt_bwtupdate_core(bwt_t *bwt); + + bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c); + void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]); + bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k); + + // more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values + void bwt_gen_cnt_table(bwt_t *bwt); + void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol); + void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]); + + int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end); + int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0); + + /** + * Extend bi-SA-interval _ik_ + */ + void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back); + + /** + * Given a query _q_, collect potential SMEMs covering position _x_ and store them in _mem_. + * Return the end of the longest exact match starting from _x_. 
+ */ + int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); + int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); + + int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/bwa/bwt_gen.c b/src/bwa/bwt_gen.c new file mode 100644 index 000000000..76f28c991 --- /dev/null +++ b/src/bwa/bwt_gen.c @@ -0,0 +1,1632 @@ +/* + + BWTConstruct.c BWT-Index Construction + + This module constructs BWT and auxiliary data structures. + + Copyright (C) 2004, Wong Chi Kwong. + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ +*/ + +#include +#include +#include +#include +#include +#include +#include "QSufSort.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +typedef uint64_t bgint_t; +typedef int64_t sbgint_t; + +#define ALPHABET_SIZE 4 +#define BIT_PER_CHAR 2 +#define CHAR_PER_WORD 16 +#define CHAR_PER_BYTE 4 + +#define BITS_IN_WORD 32 +#define BITS_IN_BYTE 8 +#define BYTES_IN_WORD 4 + +#define ALL_ONE_MASK 0xFFFFFFFF +#define DNA_OCC_CNT_TABLE_SIZE_IN_WORD 65536 + +#define BITS_PER_OCC_VALUE 16 +#define OCC_VALUE_PER_WORD 2 +#define OCC_INTERVAL 256 +#define OCC_INTERVAL_MAJOR 65536 + +#define TRUE 1 +#define FALSE 0 + +#define BWTINC_INSERT_SORT_NUM_ITEM 7 + +#define MIN_AVAILABLE_WORD 0x10000 + +#define average(value1, value2) ( ((value1) & (value2)) + ((value1) ^ (value2)) / 2 ) +#define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) ) +#define max(value1, value2) ( ((value1) > (value2)) ? (value1) : (value2) ) +#define med3(a, b, c) ( ac ? b : a>c ? c : a)) +#define swap(a, b, t); t = a; a = b; b = t; +#define truncateLeft(value, offset) ( (value) << (offset) >> (offset) ) +#define truncateRight(value, offset) ( (value) >> (offset) << (offset) ) +#define DNA_OCC_SUM_EXCEPTION(sum) ((sum & 0xfefefeff) == 0) + +typedef struct BWT { + bgint_t textLength; // length of the text + bgint_t inverseSa0; // SA-1[0] + bgint_t *cumulativeFreq; // cumulative frequency + unsigned int *bwtCode; // BWT code + unsigned int *occValue; // Occurrence values stored explicitly + bgint_t *occValueMajor; // Occurrence values stored explicitly + unsigned int *decodeTable; // For decoding BWT by table lookup + bgint_t bwtSizeInWord; // Temporary variable to hold the memory allocated + bgint_t occSizeInWord; // Temporary variable to hold the memory allocated + bgint_t occMajorSizeInWord; // Temporary variable to hold the memory allocated +} BWT; + +typedef struct BWTInc { + BWT *bwt; + unsigned int numberOfIterationDone; + bgint_t *cumulativeCountInCurrentBuild; + 
bgint_t availableWord; + bgint_t buildSize; + bgint_t initialMaxBuildSize; + bgint_t incMaxBuildSize; + unsigned int firstCharInLastIteration; + unsigned int *workingMemory; + unsigned int *packedText; + unsigned char *textBuffer; + unsigned int *packedShift; +} BWTInc; + +static bgint_t TextLengthFromBytePacked(bgint_t bytePackedLength, unsigned int bitPerChar, + unsigned int lastByteLength) +{ + return (bytePackedLength - 1) * (BITS_IN_BYTE / bitPerChar) + lastByteLength; +} + +static void initializeVAL(unsigned int *startAddr, const bgint_t length, const unsigned int initValue) +{ + bgint_t i; + for (i=0; i>= 2; + } + } + +} +// for BWTIncCreate() +static bgint_t BWTOccValueMajorSizeInWord(const bgint_t numChar) +{ + bgint_t numOfOccValue; + unsigned numOfOccIntervalPerMajor; + numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding + numOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL; + return (numOfOccValue + numOfOccIntervalPerMajor - 1) / numOfOccIntervalPerMajor * ALPHABET_SIZE; +} +// for BWTIncCreate() +static bgint_t BWTOccValueMinorSizeInWord(const bgint_t numChar) +{ + bgint_t numOfOccValue; + numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding + return (numOfOccValue + OCC_VALUE_PER_WORD - 1) / OCC_VALUE_PER_WORD * ALPHABET_SIZE; +} +// for BWTIncCreate() +static bgint_t BWTResidentSizeInWord(const bgint_t numChar) { + + bgint_t numCharRoundUpToOccInterval; + + // The $ in BWT at the position of inverseSa0 is not encoded + numCharRoundUpToOccInterval = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL; + + return (numCharRoundUpToOccInterval + CHAR_PER_WORD - 1) / CHAR_PER_WORD; + +} + +static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc) +{ + bgint_t maxBuildSize; + + if (bwtInc->bwt->textLength == 0) { + // initial build + // Minus 2 because n+1 entries of seq and rank needed for n char + maxBuildSize = 
(bwtInc->availableWord - (2 + OCC_INTERVAL / CHAR_PER_WORD) * (sizeof(bgint_t) / 4)) + / (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD / (sizeof(bgint_t) / 4); + if (bwtInc->initialMaxBuildSize > 0) { + bwtInc->buildSize = min(bwtInc->initialMaxBuildSize, maxBuildSize); + } else { + bwtInc->buildSize = maxBuildSize; + } + } else { + // Minus 3 because n+1 entries of sorted rank, seq and rank needed for n char + // Minus numberOfIterationDone because bwt slightly shift to left in each iteration + maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord + - (3 + bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR) * (sizeof(bgint_t) / 4)) + / 3 / (sizeof(bgint_t) / 4); + if (maxBuildSize < CHAR_PER_WORD) { + fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n"); + exit(1); + } + if (bwtInc->incMaxBuildSize > 0) { + bwtInc->buildSize = min(bwtInc->incMaxBuildSize, maxBuildSize); + } else { + bwtInc->buildSize = maxBuildSize; + } + if (bwtInc->buildSize < CHAR_PER_WORD) + bwtInc->buildSize = CHAR_PER_WORD; + } + + if (bwtInc->buildSize < CHAR_PER_WORD) { + fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n"); + exit(1); + } + + bwtInc->buildSize = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD; + + bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1) * (sizeof(bgint_t) / 4); + bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + (bwtInc->buildSize + 1) * (sizeof(bgint_t) / 4)); +} + +// for ceilLog2() +unsigned int leadingZero(const unsigned int input) +{ + unsigned int l; + const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + + if (input & 0xFFFF0000) { + if (input & 0xFF000000) { + l = leadingZero8bit[input >> 24]; + } else { + l = 8 + leadingZero8bit[input >> 16]; + } + } else { + if (input & 0x0000FF00) { + l = 16 + leadingZero8bit[input >> 8]; + } else { + l = 24 + leadingZero8bit[input]; + } + } + return l; + +} +// for BitPerBytePackedChar() +static unsigned int ceilLog2(const unsigned int input) +{ + if (input <= 1) return 0; + return BITS_IN_WORD - leadingZero(input - 1); + +} +// for ConvertBytePackedToWordPacked() +static unsigned int BitPerBytePackedChar(const unsigned int alphabetSize) +{ + unsigned int bitPerChar; + bitPerChar = ceilLog2(alphabetSize); + // Return the largest number of bit that does not affect packing efficiency + if (BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar) > bitPerChar) + bitPerChar = BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar); + return bitPerChar; +} +// for ConvertBytePackedToWordPacked() +static unsigned int BitPerWordPackedChar(const unsigned int alphabetSize) +{ + return ceilLog2(alphabetSize); +} + +static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize, + const bgint_t textLength) +{ + bgint_t i; + unsigned int j, k, c; + unsigned int bitPerBytePackedChar; + unsigned int bitPerWordPackedChar; + unsigned int charPerWord; + unsigned int charPerByte; + unsigned int bytePerIteration; + bgint_t byteProcessed = 0; + bgint_t wordProcessed = 0; + unsigned int mask, shift; + + unsigned int buffer[BITS_IN_WORD]; + + bitPerBytePackedChar = BitPerBytePackedChar(alphabetSize); + bitPerWordPackedChar = BitPerWordPackedChar(alphabetSize); + charPerByte = BITS_IN_BYTE / bitPerBytePackedChar; + charPerWord = BITS_IN_WORD / 
bitPerWordPackedChar; + + bytePerIteration = charPerWord / charPerByte; + mask = truncateRight(ALL_ONE_MASK, BITS_IN_WORD - bitPerWordPackedChar); + shift = BITS_IN_WORD - BITS_IN_BYTE + bitPerBytePackedChar - bitPerWordPackedChar; + + while ((wordProcessed + 1) * charPerWord < textLength) { + + k = 0; + for (i=0; i> bitPerWordPackedChar * i; + } + output[wordProcessed] = c; + wordProcessed++; + + } + + k = 0; + for (i=0; i < (textLength - wordProcessed * charPerWord - 1) / charPerByte + 1; i++) { + c = (unsigned int)input[byteProcessed] << shift; + for (j=0; j> bitPerWordPackedChar * i; + } + output[wordProcessed] = c; +} + +BWT *BWTCreate(const bgint_t textLength, unsigned int *decodeTable) +{ + BWT *bwt; + + bwt = (BWT*)calloc(1, sizeof(BWT)); + + bwt->textLength = 0; + + bwt->cumulativeFreq = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); + initializeVAL_bg(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0); + + bwt->bwtSizeInWord = 0; + + // Generate decode tables + if (decodeTable == NULL) { + bwt->decodeTable = (unsigned*)calloc(DNA_OCC_CNT_TABLE_SIZE_IN_WORD, sizeof(unsigned int)); + GenerateDNAOccCountTable(bwt->decodeTable); + } else { + bwt->decodeTable = decodeTable; + } + + bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength); + bwt->occValueMajor = (bgint_t*)calloc(bwt->occMajorSizeInWord, sizeof(bgint_t)); + + bwt->occSizeInWord = 0; + bwt->occValue = NULL; + + return bwt; +} + +BWTInc *BWTIncCreate(const bgint_t textLength, unsigned int initialMaxBuildSize, unsigned int incMaxBuildSize) +{ + BWTInc *bwtInc; + unsigned int i, n_iter; + + if (textLength < incMaxBuildSize) incMaxBuildSize = textLength; + if (textLength < initialMaxBuildSize) initialMaxBuildSize = textLength; + + bwtInc = (BWTInc*)calloc(1, sizeof(BWTInc)); + bwtInc->numberOfIterationDone = 0; + bwtInc->bwt = BWTCreate(textLength, NULL); + bwtInc->initialMaxBuildSize = initialMaxBuildSize; + bwtInc->incMaxBuildSize = incMaxBuildSize; + bwtInc->cumulativeCountInCurrentBuild 
= (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); + initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); + + // Build frequently accessed data + bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int)); + for (i=0; ipackedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR; + + n_iter = (textLength - initialMaxBuildSize) / incMaxBuildSize + 1; + bwtInc->availableWord = BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength) // minimal memory requirement + + OCC_INTERVAL / BIT_PER_CHAR * n_iter * 2 * (sizeof(bgint_t) / 4) // buffer at the end of occ array + + incMaxBuildSize/5 * 3 * (sizeof(bgint_t) / 4); // space for the 3 temporary arrays in each iteration + if (bwtInc->availableWord < MIN_AVAILABLE_WORD) bwtInc->availableWord = MIN_AVAILABLE_WORD; // lh3: otherwise segfaul when availableWord is too small + fprintf(stderr, "[%s] textLength=%ld, availableWord=%ld\n", __func__, (long)textLength, (long)bwtInc->availableWord); + bwtInc->workingMemory = (unsigned*)calloc(bwtInc->availableWord, BYTES_IN_WORD); + + return bwtInc; +} +// for BWTIncConstruct() +static void BWTIncPutPackedTextToRank(const unsigned int *packedText, bgint_t* __restrict rank, + bgint_t* __restrict cumulativeCount, const bgint_t numChar) +{ + bgint_t i; + unsigned int j; + unsigned int c, t; + unsigned int packedMask; + bgint_t rankIndex; + bgint_t lastWord; + unsigned int numCharInLastWord; + + lastWord = (numChar - 1) / CHAR_PER_WORD; + numCharInLastWord = numChar - lastWord * CHAR_PER_WORD; + + packedMask = ALL_ONE_MASK >> (BITS_IN_WORD - BIT_PER_CHAR); + rankIndex = numChar - 1; + + t = packedText[lastWord] >> (BITS_IN_WORD - numCharInLastWord * BIT_PER_CHAR); + for (i=0; i>= BIT_PER_CHAR; + } + + for (i=lastWord; i--;) { // loop from lastWord - 1 to 0 + t = packedText[i]; + for (j=0; j>= BIT_PER_CHAR; + } + } + + // Convert occurrence to cumulativeCount + cumulativeCount[2] += cumulativeCount[1]; + cumulativeCount[3] += 
cumulativeCount[2]; + cumulativeCount[4] += cumulativeCount[3]; +} + + +static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const bgint_t index, + bgint_t* __restrict occCount, const unsigned int* dnaDecodeTable) +{ + static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, + 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, + 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, + 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; + + bgint_t iteration, i; + unsigned int wordToCount, charToCount; + unsigned int j, c, sum; + + occCount[0] = 0; + occCount[1] = 0; + occCount[2] = 0; + occCount[3] = 0; + + iteration = index / 256; + wordToCount = (index - iteration * 256) / 16; + charToCount = index - iteration * 256 - wordToCount * 16; + + for (i=0; i> 16]; + sum += dnaDecodeTable[*dna & 0x0000FFFF]; + dna++; + } + if (!DNA_OCC_SUM_EXCEPTION(sum)) { + occCount[0] += sum & 0x000000FF; sum >>= 8; + occCount[1] += sum & 0x000000FF; sum >>= 8; + occCount[2] += sum & 0x000000FF; sum >>= 8; + occCount[3] += sum; + } else { + // only some or all of the 3 bits are on + // in reality, only one of the four cases are possible + if (sum == 0x00000100) { + occCount[0] += 256; + } else if (sum == 0x00010000) { + occCount[1] += 256; + } else if (sum == 0x01000000) { + occCount[2] += 256; + } else if (sum == 0x00000000) { + occCount[3] += 256; + } else { + fprintf(stderr, "ForwardDNAAllOccCountNoLimit(): DNA occ sum exception!\n"); + exit(1); + } + } + + } + + sum = 0; + for (j=0; j> 16]; + sum += dnaDecodeTable[*dna & 0x0000FFFF]; + dna++; + } + + if (charToCount > 0) { + c = *dna & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; + sum += dnaDecodeTable[c >> 16]; + sum += dnaDecodeTable[c & 0xFFFF]; + sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess + } + + occCount[0] += sum & 0x000000FF; sum >>= 8; + occCount[1] += sum & 0x000000FF; sum >>= 8; + occCount[2] += sum & 0x000000FF; sum >>= 
8; + occCount[3] += sum; +} + +static void BWTIncBuildPackedBwt(const bgint_t *relativeRank, unsigned int* __restrict bwt, const bgint_t numChar, + const bgint_t *cumulativeCount, const unsigned int *packedShift) { + + bgint_t i, r; + unsigned int c; + bgint_t previousRank, currentRank; + bgint_t wordIndex, charIndex; + bgint_t inverseSa0; + + inverseSa0 = previousRank = relativeRank[0]; + + for (i=1; i<=numChar; i++) { + currentRank = relativeRank[i]; + // previousRank > cumulativeCount[c] because $ is one of the char + c = (previousRank > cumulativeCount[1]) + (previousRank > cumulativeCount[2]) + + (previousRank > cumulativeCount[3]); + // set bwt for currentRank + if (c > 0) { + // c <> 'a' + r = currentRank; + if (r > inverseSa0) { + // - 1 because $ at inverseSa0 is not encoded + r--; + } + wordIndex = r / CHAR_PER_WORD; + charIndex = r - wordIndex * CHAR_PER_WORD; + bwt[wordIndex] |= c << packedShift[charIndex]; + } + previousRank = currentRank; + } +} + +static inline bgint_t BWTOccValueExplicit(const BWT *bwt, const bgint_t occIndexExplicit, + const unsigned int character) +{ + bgint_t occIndexMajor; + + occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR; + + if (occIndexExplicit % OCC_VALUE_PER_WORD == 0) { + return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] + + (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] >> 16); + + } else { + return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] + + (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] & 0x0000FFFF); + } +} + + +static unsigned int ForwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, + const unsigned int* dnaDecodeTable) +{ + static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, + 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, + 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, + 0xFFFFFF00, 
0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; + + unsigned int wordToCount, charToCount; + unsigned int i, c; + unsigned int sum = 0; + + wordToCount = index / 16; + charToCount = index - wordToCount * 16; + + for (i=0; i> 16]; + sum += dnaDecodeTable[dna[i] & 0x0000FFFF]; + } + + if (charToCount > 0) { + c = dna[i] & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; + sum += dnaDecodeTable[c >> 16]; + sum += dnaDecodeTable[c & 0xFFFF]; + sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess + } + + return (sum >> (character * 8)) & 0x000000FF; + +} + +static unsigned int BackwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, + const unsigned int* dnaDecodeTable) +{ + static const unsigned int truncateLeftMask[16] = { 0x00000000, 0x00000003, 0x0000000F, 0x0000003F, + 0x000000FF, 0x000003FF, 0x00000FFF, 0x00003FFF, + 0x0000FFFF, 0x0003FFFF, 0x000FFFFF, 0x003FFFFF, + 0x00FFFFFF, 0x03FFFFFF, 0x0FFFFFFF, 0x3FFFFFFF }; + + unsigned int wordToCount, charToCount; + unsigned int i, c; + unsigned int sum = 0; + + wordToCount = index / 16; + charToCount = index - wordToCount * 16; + + dna -= wordToCount + 1; + + if (charToCount > 0) { + c = *dna & truncateLeftMask[charToCount]; // increase count of 'a' by 16 - c; + sum += dnaDecodeTable[c >> 16]; + sum += dnaDecodeTable[c & 0xFFFF]; + sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess + } + + for (i=0; i> 16]; + sum += dnaDecodeTable[*dna & 0x0000FFFF]; + } + + return (sum >> (character * 8)) & 0x000000FF; + +} + +bgint_t BWTOccValue(const BWT *bwt, bgint_t index, const unsigned int character) +{ + bgint_t occValue; + bgint_t occExplicitIndex, occIndex; + + // $ is supposed to be positioned at inverseSa0 but it is not encoded + // therefore index is subtracted by 1 for adjustment + if (index > bwt->inverseSa0) + index--; + + occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // Bidirectional encoding + 
occIndex = occExplicitIndex * OCC_INTERVAL; + occValue = BWTOccValueExplicit(bwt, occExplicitIndex, character); + + if (occIndex == index) + return occValue; + + if (occIndex < index) { + return occValue + ForwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, index - occIndex, character, bwt->decodeTable); + } else { + return occValue - BackwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, occIndex - index, character, bwt->decodeTable); + } +} + +static bgint_t BWTIncGetAbsoluteRank(BWT *bwt, bgint_t* __restrict absoluteRank, bgint_t* __restrict seq, + const unsigned int *packedText, const bgint_t numChar, + const bgint_t* cumulativeCount, const unsigned int firstCharInLastIteration) +{ + bgint_t saIndex; + bgint_t lastWord; + unsigned int packedMask; + bgint_t i; + unsigned int c, t, j; + bgint_t rankIndex; + unsigned int shift; + bgint_t seqIndexFromStart[ALPHABET_SIZE]; + bgint_t seqIndexFromEnd[ALPHABET_SIZE]; + + for (i=0; i> shift; + saIndex = bwt->inverseSa0; + rankIndex = numChar - 1; + + lastWord = numChar / CHAR_PER_WORD; + for (i=lastWord; i--;) { // loop from lastWord - 1 to 0 + t = packedText[i]; + for (j=0; jcumulativeFreq[c] + BWTOccValue(bwt, saIndex, c) + 1; + // A counting sort using the first character of suffix is done here + // If rank > inverseSa0 -> fill seq from end, otherwise fill seq from start -> to leave the right entry for inverseSa0 + if (saIndex > bwt->inverseSa0) { + seq[seqIndexFromEnd[c]] = rankIndex; + absoluteRank[seqIndexFromEnd[c]] = saIndex; + seqIndexFromEnd[c]--; + } else { + seq[seqIndexFromStart[c]] = rankIndex; + absoluteRank[seqIndexFromStart[c]] = saIndex; + seqIndexFromStart[c]++; + } + rankIndex--; + t >>= BIT_PER_CHAR; + } + } + + absoluteRank[seqIndexFromStart[firstCharInLastIteration]] = bwt->inverseSa0; // representing the substring of all preceding characters + seq[seqIndexFromStart[firstCharInLastIteration]] = numChar; + + return seqIndexFromStart[firstCharInLastIteration]; +} + +static void 
BWTIncSortKey(bgint_t* __restrict key, bgint_t* __restrict seq, const bgint_t numItem) +{ + #define EQUAL_KEY_THRESHOLD 4 // Partition for equal key if data array size / the number of data with equal value with pivot < EQUAL_KEY_THRESHOLD + + int64_t lowIndex, highIndex, midIndex; + int64_t lowPartitionIndex, highPartitionIndex; + int64_t lowStack[32], highStack[32]; + int stackDepth; + int64_t i, j; + bgint_t tempSeq, tempKey; + int64_t numberOfEqualKey; + + if (numItem < 2) return; + + stackDepth = 0; + + lowIndex = 0; + highIndex = numItem - 1; + + for (;;) { + + for (;;) { + + // Sort small array of data + if (highIndex - lowIndex < BWTINC_INSERT_SORT_NUM_ITEM) { // Insertion sort on smallest arrays + for (i=lowIndex+1; i<=highIndex; i++) { + tempSeq = seq[i]; + tempKey = key[i]; + for (j = i; j > lowIndex && key[j-1] > tempKey; j--) { + seq[j] = seq[j-1]; + key[j] = key[j-1]; + } + if (j != i) { + seq[j] = tempSeq; + key[j] = tempKey; + } + } + break; + } + + // Choose pivot as median of the lowest, middle, and highest data; sort the three data + + midIndex = average(lowIndex, highIndex); + if (key[lowIndex] > key[midIndex]) { + tempSeq = seq[lowIndex]; + tempKey = key[lowIndex]; + seq[lowIndex] = seq[midIndex]; + key[lowIndex] = key[midIndex]; + seq[midIndex] = tempSeq; + key[midIndex] = tempKey; + } + if (key[lowIndex] > key[highIndex]) { + tempSeq = seq[lowIndex]; + tempKey = key[lowIndex]; + seq[lowIndex] = seq[highIndex]; + key[lowIndex] = key[highIndex]; + seq[highIndex] = tempSeq; + key[highIndex] = tempKey; + } + if (key[midIndex] > key[highIndex]) { + tempSeq = seq[midIndex]; + tempKey = key[midIndex]; + seq[midIndex] = seq[highIndex]; + key[midIndex] = key[highIndex]; + seq[highIndex] = tempSeq; + key[highIndex] = tempKey; + } + + // Partition data + + numberOfEqualKey = 0; + + lowPartitionIndex = lowIndex + 1; + highPartitionIndex = highIndex - 1; + + for (;;) { + while (lowPartitionIndex <= highPartitionIndex && key[lowPartitionIndex] <= 
key[midIndex]) { + numberOfEqualKey += (key[lowPartitionIndex] == key[midIndex]); + lowPartitionIndex++; + } + while (lowPartitionIndex < highPartitionIndex) { + if (key[midIndex] >= key[highPartitionIndex]) { + numberOfEqualKey += (key[midIndex] == key[highPartitionIndex]); + break; + } + highPartitionIndex--; + } + if (lowPartitionIndex >= highPartitionIndex) { + break; + } + tempSeq = seq[lowPartitionIndex]; + tempKey = key[lowPartitionIndex]; + seq[lowPartitionIndex] = seq[highPartitionIndex]; + key[lowPartitionIndex] = key[highPartitionIndex]; + seq[highPartitionIndex] = tempSeq; + key[highPartitionIndex] = tempKey; + if (highPartitionIndex == midIndex) { + // partition key has been moved + midIndex = lowPartitionIndex; + } + lowPartitionIndex++; + highPartitionIndex--; + } + + // Adjust the partition index + highPartitionIndex = lowPartitionIndex; + lowPartitionIndex--; + + // move the partition key to end of low partition + tempSeq = seq[midIndex]; + tempKey = key[midIndex]; + seq[midIndex] = seq[lowPartitionIndex]; + key[midIndex] = key[lowPartitionIndex]; + seq[lowPartitionIndex] = tempSeq; + key[lowPartitionIndex] = tempKey; + + if (highIndex - lowIndex + BWTINC_INSERT_SORT_NUM_ITEM <= EQUAL_KEY_THRESHOLD * numberOfEqualKey) { + + // Many keys = partition key; separate the equal key data from the lower partition + + midIndex = lowIndex; + + for (;;) { + while (midIndex < lowPartitionIndex && key[midIndex] < key[lowPartitionIndex]) { + midIndex++; + } + while (midIndex < lowPartitionIndex && key[lowPartitionIndex] == key[lowPartitionIndex - 1]) { + lowPartitionIndex--; + } + if (midIndex >= lowPartitionIndex) { + break; + } + tempSeq = seq[midIndex]; + tempKey = key[midIndex]; + seq[midIndex] = seq[lowPartitionIndex - 1]; + key[midIndex] = key[lowPartitionIndex - 1]; + seq[lowPartitionIndex - 1] = tempSeq; + key[lowPartitionIndex - 1] = tempKey; + midIndex++; + lowPartitionIndex--; + } + + } + + if (lowPartitionIndex - lowIndex > highIndex - 
highPartitionIndex) { + // put the larger partition to stack + lowStack[stackDepth] = lowIndex; + highStack[stackDepth] = lowPartitionIndex - 1; + stackDepth++; + // sort the smaller partition first + lowIndex = highPartitionIndex; + } else { + // put the larger partition to stack + lowStack[stackDepth] = highPartitionIndex; + highStack[stackDepth] = highIndex; + stackDepth++; + // sort the smaller partition first + if (lowPartitionIndex > lowIndex) { + highIndex = lowPartitionIndex - 1; + } else { + // all keys in the partition equals to the partition key + break; + } + } + continue; + } + + // Pop a range from stack + if (stackDepth > 0) { + stackDepth--; + lowIndex = lowStack[stackDepth]; + highIndex = highStack[stackDepth]; + continue; + } else return; + } +} + + +static void BWTIncBuildRelativeRank(bgint_t* __restrict sortedRank, bgint_t* __restrict seq, + bgint_t* __restrict relativeRank, const bgint_t numItem, + bgint_t oldInverseSa0, const bgint_t *cumulativeCount) +{ + bgint_t i, c; + bgint_t s, r; + bgint_t lastRank, lastIndex; + bgint_t oldInverseSa0RelativeRank = 0; + bgint_t freq; + + lastIndex = numItem; + lastRank = sortedRank[numItem]; + if (lastRank > oldInverseSa0) { + sortedRank[numItem]--; // to prepare for merging; $ is not encoded in bwt + } + s = seq[numItem]; + relativeRank[s] = numItem; + if (lastRank == oldInverseSa0) { + oldInverseSa0RelativeRank = numItem; + oldInverseSa0++; // so that this segment of code is not run again + lastRank++; // so that oldInverseSa0 become a sorted group with 1 item + } + + c = ALPHABET_SIZE - 1; + freq = cumulativeCount[c]; + + for (i=numItem; i--;) { // from numItem - 1 to 0 + r = sortedRank[i]; + if (r > oldInverseSa0) + sortedRank[i]--; // to prepare for merging; $ is not encoded in bwt + s = seq[i]; + if (i < freq) { + if (lastIndex >= freq) + lastRank++; // to trigger the group across alphabet boundary to be split + c--; + freq = cumulativeCount[c]; + } + if (r == lastRank) { + relativeRank[s] = 
lastIndex; + } else { + if (i == lastIndex - 1) { + if (lastIndex < numItem && (sbgint_t)seq[lastIndex + 1] < 0) { + seq[lastIndex] = seq[lastIndex + 1] - 1; + } else { + seq[lastIndex] = (bgint_t)-1; + } + } + lastIndex = i; + lastRank = r; + relativeRank[s] = i; + if (r == oldInverseSa0) { + oldInverseSa0RelativeRank = i; + oldInverseSa0++; // so that this segment of code is not run again + lastRank++; // so that oldInverseSa0 become a sorted group with 1 item + } + } + } + +} + +static void BWTIncBuildBwt(unsigned int* insertBwt, const bgint_t *relativeRank, const bgint_t numChar, + const bgint_t *cumulativeCount) +{ + unsigned int c; + bgint_t i; + bgint_t previousRank, currentRank; + + previousRank = relativeRank[0]; + + for (i=1; i<=numChar; i++) { + currentRank = relativeRank[i]; + c = (previousRank >= cumulativeCount[1]) + (previousRank >= cumulativeCount[2]) + + (previousRank >= cumulativeCount[3]); + insertBwt[currentRank] = c; + previousRank = currentRank; + } +} + +static void BWTIncMergeBwt(const bgint_t *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt, + unsigned int* __restrict mergedBwt, const bgint_t numOldBwt, const bgint_t numInsertBwt) +{ + unsigned int bitsInWordMinusBitPerChar; + bgint_t leftShift, rightShift; + bgint_t o; + bgint_t oIndex, iIndex, mIndex; + bgint_t mWord, mChar, oWord, oChar; + bgint_t numInsert; + + bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR; + + oIndex = 0; + iIndex = 0; + mIndex = 0; + + mWord = 0; + mChar = 0; + + mergedBwt[0] = 0; // this can be cleared as merged Bwt slightly shift to the left in each iteration + + while (oIndex < numOldBwt) { + + // copy from insertBwt + while (iIndex <= numInsertBwt && sortedRank[iIndex] <= oIndex) { + if (sortedRank[iIndex] != 0) { // special value to indicate that this is for new inverseSa0 + mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR); + mIndex++; + mChar++; + if (mChar == CHAR_PER_WORD) { + mChar = 0; + 
mWord++; + mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary + } + } + iIndex++; + } + + // Copy from oldBwt to mergedBwt + if (iIndex <= numInsertBwt) { + o = sortedRank[iIndex]; + } else { + o = numOldBwt; + } + numInsert = o - oIndex; + + oWord = oIndex / CHAR_PER_WORD; + oChar = oIndex - oWord * CHAR_PER_WORD; + if (oChar > mChar) { + leftShift = (oChar - mChar) * BIT_PER_CHAR; + rightShift = (CHAR_PER_WORD + mChar - oChar) * BIT_PER_CHAR; + mergedBwt[mWord] = mergedBwt[mWord] + | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR)) + | (oldBwt[oWord+1] >> rightShift); + oIndex += min(numInsert, CHAR_PER_WORD - mChar); + while (o > oIndex) { + oWord++; + mWord++; + mergedBwt[mWord] = (oldBwt[oWord] << leftShift) | (oldBwt[oWord+1] >> rightShift); + oIndex += CHAR_PER_WORD; + } + } else if (oChar < mChar) { + rightShift = (mChar - oChar) * BIT_PER_CHAR; + leftShift = (CHAR_PER_WORD + oChar - mChar) * BIT_PER_CHAR; + mergedBwt[mWord] = mergedBwt[mWord] + | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR)); + oIndex += min(numInsert, CHAR_PER_WORD - mChar); + while (o > oIndex) { + oWord++; + mWord++; + mergedBwt[mWord] = (oldBwt[oWord-1] << leftShift) | (oldBwt[oWord] >> rightShift); + oIndex += CHAR_PER_WORD; + } + } else { // oChar == mChar + mergedBwt[mWord] = mergedBwt[mWord] | truncateLeft(oldBwt[oWord], mChar * BIT_PER_CHAR); + oIndex += min(numInsert, CHAR_PER_WORD - mChar); + while (o > oIndex) { + oWord++; + mWord++; + mergedBwt[mWord] = oldBwt[oWord]; + oIndex += CHAR_PER_WORD; + } + } + oIndex = o; + mIndex += numInsert; + + // Clear the trailing garbage in mergedBwt + mWord = mIndex / CHAR_PER_WORD; + mChar = mIndex - mWord * CHAR_PER_WORD; + if (mChar == 0) { + mergedBwt[mWord] = 0; + } else { + mergedBwt[mWord] = truncateRight(mergedBwt[mWord], (BITS_IN_WORD - mChar * BIT_PER_CHAR)); + } + + } + + // copy from insertBwt + while (iIndex <= numInsertBwt) { + if (sortedRank[iIndex] != 0) { + 
mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR); + mIndex++; + mChar++; + if (mChar == CHAR_PER_WORD) { + mChar = 0; + mWord++; + mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary + } + } + iIndex++; + } +} + +void BWTClearTrailingBwtCode(BWT *bwt) +{ + bgint_t bwtResidentSizeInWord; + bgint_t wordIndex, offset; + bgint_t i; + + bwtResidentSizeInWord = BWTResidentSizeInWord(bwt->textLength); + + wordIndex = bwt->textLength / CHAR_PER_WORD; + offset = (bwt->textLength - wordIndex * CHAR_PER_WORD) * BIT_PER_CHAR; + if (offset > 0) { + bwt->bwtCode[wordIndex] = truncateRight(bwt->bwtCode[wordIndex], BITS_IN_WORD - offset); + } else { + if (wordIndex < bwtResidentSizeInWord) { + bwt->bwtCode[wordIndex] = 0; + } + } + + for (i=wordIndex+1; ibwtCode[i] = 0; + } +} + + +void BWTGenerateOccValueFromBwt(const unsigned int* bwt, unsigned int* __restrict occValue, + bgint_t* __restrict occValueMajor, + const bgint_t textLength, const unsigned int* decodeTable) +{ + bgint_t numberOfOccValueMajor, numberOfOccValue; + unsigned int wordBetweenOccValue; + bgint_t numberOfOccIntervalPerMajor; + unsigned int c; + bgint_t i, j; + bgint_t occMajorIndex; + bgint_t occIndex, bwtIndex; + bgint_t sum; // perhaps unsigned is big enough + bgint_t tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE]; + + wordBetweenOccValue = OCC_INTERVAL / CHAR_PER_WORD; + + // Calculate occValue + numberOfOccValue = (textLength + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding + numberOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL; + numberOfOccValueMajor = (numberOfOccValue + numberOfOccIntervalPerMajor - 1) / numberOfOccIntervalPerMajor; + + tempOccValue0[0] = 0; + tempOccValue0[1] = 0; + tempOccValue0[2] = 0; + tempOccValue0[3] = 0; + occValueMajor[0] = 0; + occValueMajor[1] = 0; + occValueMajor[2] = 0; + occValueMajor[3] = 0; + + occIndex = 0; + bwtIndex = 0; + for (occMajorIndex=1; 
occMajorIndex> 16]; + sum += decodeTable[c & 0x0000FFFF]; + bwtIndex++; + } + if (!DNA_OCC_SUM_EXCEPTION(sum)) { + tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[3] += sum; + } else { + if (sum == 0x00000100) { + tempOccValue1[0] += 256; + } else if (sum == 0x00010000) { + tempOccValue1[1] += 256; + } else if (sum == 0x01000000) { + tempOccValue1[2] += 256; + } else { + tempOccValue1[3] += 256; + } + } + occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; + occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; + occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; + occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; + tempOccValue0[0] = tempOccValue1[0]; + tempOccValue0[1] = tempOccValue1[1]; + tempOccValue0[2] = tempOccValue1[2]; + tempOccValue0[3] = tempOccValue1[3]; + sum = 0; + + occIndex++; + + for (j=0; j> 16]; + sum += decodeTable[c & 0x0000FFFF]; + bwtIndex++; + } + if (!DNA_OCC_SUM_EXCEPTION(sum)) { + tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8; + tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8; + tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8; + tempOccValue0[3] += sum; + } else { + if (sum == 0x00000100) { + tempOccValue0[0] += 256; + } else if (sum == 0x00010000) { + tempOccValue0[1] += 256; + } else if (sum == 0x01000000) { + tempOccValue0[2] += 256; + } else { + tempOccValue0[3] += 256; + } + } + } + + occValueMajor[occMajorIndex * 4 + 0] = occValueMajor[(occMajorIndex - 1) * 4 + 0] + tempOccValue0[0]; + occValueMajor[occMajorIndex * 4 + 1] = occValueMajor[(occMajorIndex - 1) * 4 + 1] + tempOccValue0[1]; + occValueMajor[occMajorIndex * 4 + 2] = occValueMajor[(occMajorIndex - 1) * 4 + 2] + tempOccValue0[2]; + occValueMajor[occMajorIndex * 4 + 3] = occValueMajor[(occMajorIndex - 1) * 4 + 3] + tempOccValue0[3]; + tempOccValue0[0] = 0; 
+ tempOccValue0[1] = 0; + tempOccValue0[2] = 0; + tempOccValue0[3] = 0; + + } + + while (occIndex < (numberOfOccValue-1)/2) { + sum = 0; + tempOccValue1[0] = tempOccValue0[0]; + tempOccValue1[1] = tempOccValue0[1]; + tempOccValue1[2] = tempOccValue0[2]; + tempOccValue1[3] = tempOccValue0[3]; + for (j=0; j> 16]; + sum += decodeTable[c & 0x0000FFFF]; + bwtIndex++; + } + if (!DNA_OCC_SUM_EXCEPTION(sum)) { + tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[3] += sum; + } else { + if (sum == 0x00000100) { + tempOccValue1[0] += 256; + } else if (sum == 0x00010000) { + tempOccValue1[1] += 256; + } else if (sum == 0x01000000) { + tempOccValue1[2] += 256; + } else { + tempOccValue1[3] += 256; + } + } + occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; + occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; + occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; + occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; + tempOccValue0[0] = tempOccValue1[0]; + tempOccValue0[1] = tempOccValue1[1]; + tempOccValue0[2] = tempOccValue1[2]; + tempOccValue0[3] = tempOccValue1[3]; + sum = 0; + occIndex++; + + for (j=0; j> 16]; + sum += decodeTable[c & 0x0000FFFF]; + bwtIndex++; + } + if (!DNA_OCC_SUM_EXCEPTION(sum)) { + tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8; + tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8; + tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8; + tempOccValue0[3] += sum; + } else { + if (sum == 0x00000100) { + tempOccValue0[0] += 256; + } else if (sum == 0x00010000) { + tempOccValue0[1] += 256; + } else if (sum == 0x01000000) { + tempOccValue0[2] += 256; + } else { + tempOccValue0[3] += 256; + } + } + } + + sum = 0; + tempOccValue1[0] = tempOccValue0[0]; + tempOccValue1[1] = tempOccValue0[1]; + tempOccValue1[2] = tempOccValue0[2]; + tempOccValue1[3] = 
tempOccValue0[3]; + + if (occIndex * 2 < numberOfOccValue - 1) { + for (j=0; j> 16]; + sum += decodeTable[c & 0x0000FFFF]; + bwtIndex++; + } + if (!DNA_OCC_SUM_EXCEPTION(sum)) { + tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[3] += sum; + } else { + if (sum == 0x00000100) { + tempOccValue1[0] += 256; + } else if (sum == 0x00010000) { + tempOccValue1[1] += 256; + } else if (sum == 0x01000000) { + tempOccValue1[2] += 256; + } else { + tempOccValue1[3] += 256; + } + } + } + + occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; + occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; + occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; + occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; + +} + +static void BWTIncConstruct(BWTInc *bwtInc, const bgint_t numChar) +{ + unsigned int i; + bgint_t mergedBwtSizeInWord, mergedOccSizeInWord; + unsigned int firstCharInThisIteration; + + bgint_t *relativeRank, *seq, *sortedRank; + unsigned int *insertBwt, *mergedBwt; + bgint_t newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0; + + mergedBwtSizeInWord = BWTResidentSizeInWord(bwtInc->bwt->textLength + numChar); + mergedOccSizeInWord = BWTOccValueMinorSizeInWord(bwtInc->bwt->textLength + numChar); + + initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); + + if (bwtInc->bwt->textLength == 0) { // Initial build + + // Set address + seq = (bgint_t*)bwtInc->workingMemory; + relativeRank = seq + bwtInc->buildSize + 1; + // mergedBwt and packedTex may share memory + mergedBwt = insertBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord; // build in place + + assert((void*)(relativeRank + bwtInc->buildSize + 1) <= (void*)bwtInc->packedText); + assert((void*)(relativeRank + bwtInc->buildSize + 1) <= 
(void*)mergedBwt); + + // ->packedText is not used any more and may be overwritten by mergedBwt + BWTIncPutPackedTextToRank(bwtInc->packedText, relativeRank, bwtInc->cumulativeCountInCurrentBuild, numChar); + + firstCharInThisIteration = relativeRank[0]; + relativeRank[numChar] = 0; + + // Sort suffix + QSufSortSuffixSort((qsint_t*)relativeRank, (qsint_t*)seq, (qsint_t)numChar, (qsint_t)ALPHABET_SIZE - 1, 0, FALSE); + newInverseSa0 = relativeRank[0]; + + // Clear BWT area + initializeVAL(insertBwt, mergedBwtSizeInWord, 0); + + // Build BWT + BWTIncBuildPackedBwt(relativeRank, insertBwt, numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->packedShift); + + // so that the cumulativeCount is not deducted + bwtInc->firstCharInLastIteration = ALPHABET_SIZE; + + } else { // Incremental build + // Set address + sortedRank = (bgint_t*)bwtInc->workingMemory; + seq = sortedRank + bwtInc->buildSize + 1; + insertBwt = (unsigned*)seq; // insertBwt and seq share memory + // relativeRank and ->packedText may share memory + relativeRank = seq + bwtInc->buildSize + 1; + + assert((void*)relativeRank <= (void*)bwtInc->packedText); + + // Store the first character of this iteration + firstCharInThisIteration = bwtInc->packedText[0] >> (BITS_IN_WORD - BIT_PER_CHAR); + + // Count occurrence of input text + ForwardDNAAllOccCountNoLimit(bwtInc->packedText, numChar, bwtInc->cumulativeCountInCurrentBuild + 1, bwtInc->bwt->decodeTable); + // Add the first character of the previous iteration to represent the inverseSa0 of the previous iteration + bwtInc->cumulativeCountInCurrentBuild[bwtInc->firstCharInLastIteration + 1]++; + bwtInc->cumulativeCountInCurrentBuild[2] += bwtInc->cumulativeCountInCurrentBuild[1]; + bwtInc->cumulativeCountInCurrentBuild[3] += bwtInc->cumulativeCountInCurrentBuild[2]; + bwtInc->cumulativeCountInCurrentBuild[4] += bwtInc->cumulativeCountInCurrentBuild[3]; + + // Get rank of new suffix among processed suffix + // The seq array is built into ALPHABET_SIZE + 2 
groups; ALPHABET_SIZE groups + 1 group divided into 2 by inverseSa0 + inverseSa0 as 1 group + // ->packedText is not used any more and will be overwritten by relativeRank + oldInverseSa0RelativeRank = BWTIncGetAbsoluteRank(bwtInc->bwt, sortedRank, seq, bwtInc->packedText, + numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->firstCharInLastIteration); + + // Sort rank by ALPHABET_SIZE + 2 groups (or ALPHABET_SIZE + 1 groups when inverseSa0 sit on the border of a group) + for (i=0; icumulativeCountInCurrentBuild[i] > oldInverseSa0RelativeRank || + bwtInc->cumulativeCountInCurrentBuild[i+1] <= oldInverseSa0RelativeRank) { + BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], bwtInc->cumulativeCountInCurrentBuild[i+1] - bwtInc->cumulativeCountInCurrentBuild[i]); + } else { + if (bwtInc->cumulativeCountInCurrentBuild[i] < oldInverseSa0RelativeRank) { + BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], oldInverseSa0RelativeRank - bwtInc->cumulativeCountInCurrentBuild[i]); + } + if (bwtInc->cumulativeCountInCurrentBuild[i+1] > oldInverseSa0RelativeRank + 1) { + BWTIncSortKey(sortedRank + oldInverseSa0RelativeRank + 1, seq + oldInverseSa0RelativeRank + 1, bwtInc->cumulativeCountInCurrentBuild[i+1] - oldInverseSa0RelativeRank - 1); + } + } + } + + // build relative rank; sortedRank is updated for merging to cater for the fact that $ is not encoded in bwt + // the cumulative freq information is used to make sure that inverseSa0 and suffix beginning with different characters are kept in different unsorted groups) + BWTIncBuildRelativeRank(sortedRank, seq, relativeRank, numChar, bwtInc->bwt->inverseSa0, bwtInc->cumulativeCountInCurrentBuild); + assert(relativeRank[numChar] == oldInverseSa0RelativeRank); + + // Sort suffix + QSufSortSuffixSort((qsint_t*)relativeRank, (qsint_t*)seq, (qsint_t)numChar, (qsint_t)numChar, 1, TRUE); + + 
newInverseSa0RelativeRank = relativeRank[0]; + newInverseSa0 = sortedRank[newInverseSa0RelativeRank] + newInverseSa0RelativeRank; + + sortedRank[newInverseSa0RelativeRank] = 0; // a special value so that this is skipped in the merged bwt + + // Build BWT; seq is overwritten by insertBwt + BWTIncBuildBwt(insertBwt, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild); + + // Merge BWT; relativeRank may be overwritten by mergedBwt + mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord + - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR * (sizeof(bgint_t) / 4); // minus numberOfIteration * occInterval to create a buffer for merging + assert(mergedBwt >= insertBwt + numChar); + BWTIncMergeBwt(sortedRank, bwtInc->bwt->bwtCode, insertBwt, mergedBwt, bwtInc->bwt->textLength, numChar); + } + + // Build auxiliary structure and update info and pointers in BWT + bwtInc->bwt->textLength += numChar; + bwtInc->bwt->bwtCode = mergedBwt; + bwtInc->bwt->bwtSizeInWord = mergedBwtSizeInWord; + bwtInc->bwt->occSizeInWord = mergedOccSizeInWord; + assert(mergedBwt >= bwtInc->workingMemory + mergedOccSizeInWord); + + bwtInc->bwt->occValue = mergedBwt - mergedOccSizeInWord; + + BWTClearTrailingBwtCode(bwtInc->bwt); + BWTGenerateOccValueFromBwt(bwtInc->bwt->bwtCode, bwtInc->bwt->occValue, bwtInc->bwt->occValueMajor, + bwtInc->bwt->textLength, bwtInc->bwt->decodeTable); + + bwtInc->bwt->inverseSa0 = newInverseSa0; + + bwtInc->bwt->cumulativeFreq[1] += bwtInc->cumulativeCountInCurrentBuild[1] - (bwtInc->firstCharInLastIteration <= 0); + bwtInc->bwt->cumulativeFreq[2] += bwtInc->cumulativeCountInCurrentBuild[2] - (bwtInc->firstCharInLastIteration <= 1); + bwtInc->bwt->cumulativeFreq[3] += bwtInc->cumulativeCountInCurrentBuild[3] - (bwtInc->firstCharInLastIteration <= 2); + bwtInc->bwt->cumulativeFreq[4] += bwtInc->cumulativeCountInCurrentBuild[4] - (bwtInc->firstCharInLastIteration <= 3); + + bwtInc->firstCharInLastIteration = 
firstCharInThisIteration; + + // Set build size and text address for the next build + BWTIncSetBuildSizeAndTextAddr(bwtInc); + bwtInc->numberOfIterationDone++; + +} + +BWTInc *BWTIncConstructFromPacked(const char *inputFileName, bgint_t initialMaxBuildSize, bgint_t incMaxBuildSize) +{ + + FILE *packedFile; + bgint_t packedFileLen; + bgint_t totalTextLength; + bgint_t textToLoad, textSizeInByte; + bgint_t processedTextLength; + unsigned char lastByteLength; + + BWTInc *bwtInc; + + packedFile = (FILE*)fopen(inputFileName, "rb"); + + if (packedFile == NULL) { + fprintf(stderr, "BWTIncConstructFromPacked() : Cannot open %s : %s\n", + inputFileName, strerror(errno)); + exit(1); + } + + if (fseek(packedFile, -1, SEEK_END) != 0) { + fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", + inputFileName, strerror(errno)); + exit(1); + } + packedFileLen = ftell(packedFile); + if (packedFileLen == -1) { + fprintf(stderr, "BWTIncConstructFromPacked() : Can't ftell on %s : %s\n", + inputFileName, strerror(errno)); + exit(1); + } + if (fread(&lastByteLength, sizeof(unsigned char), 1, packedFile) != 1) { + fprintf(stderr, + "BWTIncConstructFromPacked() : Can't read from %s : %s\n", + inputFileName, + ferror(packedFile)? 
strerror(errno) : "Unexpected end of file"); + exit(1); + } + totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength); + + bwtInc = BWTIncCreate(totalTextLength, initialMaxBuildSize, incMaxBuildSize); + + BWTIncSetBuildSizeAndTextAddr(bwtInc); + + if (bwtInc->buildSize > totalTextLength) { + textToLoad = totalTextLength; + } else { + textToLoad = totalTextLength - ((totalTextLength - bwtInc->buildSize + CHAR_PER_WORD - 1) / CHAR_PER_WORD * CHAR_PER_WORD); + } + textSizeInByte = textToLoad / CHAR_PER_BYTE; // excluded the odd byte + + if (fseek(packedFile, -((long)textSizeInByte + 2), SEEK_CUR) != 0) { + fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", + inputFileName, strerror(errno)); + exit(1); + } + if (fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile) != textSizeInByte + 1) { + fprintf(stderr, + "BWTIncConstructFromPacked() : Can't read from %s : %s\n", + inputFileName, + ferror(packedFile)? strerror(errno) : "Unexpected end of file"); + exit(1); + } + if (fseek(packedFile, -((long)textSizeInByte + 1), SEEK_CUR) != 0) { + fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", + inputFileName, strerror(errno)); + exit(1); + } + + ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); + BWTIncConstruct(bwtInc, textToLoad); + + processedTextLength = textToLoad; + + while (processedTextLength < totalTextLength) { + textToLoad = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD; + if (textToLoad > totalTextLength - processedTextLength) { + textToLoad = totalTextLength - processedTextLength; + } + textSizeInByte = textToLoad / CHAR_PER_BYTE; + if (fseek(packedFile, -((long)textSizeInByte), SEEK_CUR) != 0) { + fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", + inputFileName, strerror(errno)); + exit(1); + } + if (fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile) != 
textSizeInByte) { + fprintf(stderr, + "BWTIncConstructFromPacked() : Can't read from %s : %s\n", + inputFileName, + ferror(packedFile)? strerror(errno) : "Unexpected end of file"); + exit(1); + } + if (fseek(packedFile, -((long)textSizeInByte), SEEK_CUR) != 0) { + fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", + inputFileName, strerror(errno)); + exit(1); + } + ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); + BWTIncConstruct(bwtInc, textToLoad); + processedTextLength += textToLoad; + if (bwtInc->numberOfIterationDone % 10 == 0) { + fprintf(stderr, "[BWTIncConstructFromPacked] %lu iterations done. %lu characters processed.\n", + (long)bwtInc->numberOfIterationDone, (long)processedTextLength); + } + } + return bwtInc; +} + +void BWTFree(BWT *bwt) +{ + if (bwt == 0) return; + free(bwt->cumulativeFreq); + free(bwt->bwtCode); + free(bwt->occValue); + free(bwt->occValueMajor); + free(bwt->decodeTable); + free(bwt); +} + +void BWTIncFree(BWTInc *bwtInc) +{ + if (bwtInc == 0) return; + free(bwtInc->bwt); + free(bwtInc->workingMemory); + free(bwtInc); +} + +static bgint_t BWTFileSizeInWord(const bgint_t numChar) +{ + // The $ in BWT at the position of inverseSa0 is not encoded + return (numChar + CHAR_PER_WORD - 1) / CHAR_PER_WORD; +} + +void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *occValueFileName) +{ + FILE *bwtFile; +/* FILE *occValueFile; */ + bgint_t bwtLength; + + bwtFile = (FILE*)fopen(bwtFileName, "wb"); + if (bwtFile == NULL) { + fprintf(stderr, + "BWTSaveBwtCodeAndOcc(): Cannot open %s for writing: %s\n", + bwtFileName, strerror(errno)); + exit(1); + } + + bwtLength = BWTFileSizeInWord(bwt->textLength); + + if (fwrite(&bwt->inverseSa0, sizeof(bgint_t), 1, bwtFile) != 1 + || fwrite(bwt->cumulativeFreq + 1, + sizeof(bgint_t), ALPHABET_SIZE, bwtFile) != ALPHABET_SIZE + || fwrite(bwt->bwtCode, + sizeof(unsigned int), bwtLength, bwtFile) != bwtLength) { + 
fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Error writing to %s : %s\n", + bwtFileName, strerror(errno)); + exit(1); + } + if (fclose(bwtFile) != 0) { + fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Error on closing %s : %s\n", + bwtFileName, strerror(errno)); + exit(1); + } +} + +void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size) +{ + BWTInc *bwtInc; + bwtInc = BWTIncConstructFromPacked(fn_pac, block_size, block_size); + printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone); + BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0); + BWTIncFree(bwtInc); +} + +void bwt_bwtgen(const char *fn_pac, const char *fn_bwt) +{ + bwt_bwtgen2(fn_pac, fn_bwt, 10000000); +} + +int bwt_bwtgen_main(int argc, char *argv[]) +{ + if (argc < 3) { + fprintf(stderr, "Usage: bwtgen \n"); + return 1; + } + bwt_bwtgen(argv[1], argv[2]); + return 0; +} + +#ifdef MAIN_BWT_GEN + +int main(int argc, char *argv[]) +{ + return bwt_bwtgen_main(argc, argv); +} + +#endif diff --git a/src/bwa/bwt_lite.c b/src/bwa/bwt_lite.c new file mode 100644 index 000000000..f7946f549 --- /dev/null +++ b/src/bwa/bwt_lite.c @@ -0,0 +1,98 @@ +#include +#include +#include +#include "bwt_lite.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +int is_sa(const uint8_t *T, int *SA, int n); +int is_bwt(uint8_t *T, int n); + +bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq) +{ + bwtl_t *b; + int i; + b = (bwtl_t*)calloc(1, sizeof(bwtl_t)); + b->seq_len = len; + + { // calculate b->bwt + uint8_t *s; + b->sa = (uint32_t*)calloc(len + 1, 4); + is_sa(seq, (int*)b->sa, len); + s = (uint8_t*)calloc(len + 1, 1); + for (i = 0; i <= len; ++i) { + if (b->sa[i] == 0) b->primary = i; + else s[i] = seq[b->sa[i] - 1]; + } + for (i = b->primary; i < len; ++i) s[i] = s[i + 1]; + b->bwt_size = (len + 15) / 16; + b->bwt = (uint32_t*)calloc(b->bwt_size, 4); + for (i = 0; i < len; ++i) + b->bwt[i>>4] |= s[i] << ((15 - (i&15)) << 1); + free(s); + } + { // calculate b->occ + 
uint32_t c[4]; + b->n_occ = (len + 15) / 16 * 4; + b->occ = (uint32_t*)calloc(b->n_occ, 4); + memset(c, 0, 16); + for (i = 0; i < len; ++i) { + if (i % 16 == 0) + memcpy(b->occ + (i/16) * 4, c, 16); + ++c[bwtl_B0(b, i)]; + } + memcpy(b->L2+1, c, 16); + for (i = 2; i < 5; ++i) b->L2[i] += b->L2[i-1]; + } + { // generate cnt_table + for (i = 0; i != 256; ++i) { + u_int32_t j, x = 0; + for (j = 0; j != 4; ++j) + x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3); + b->cnt_table[i] = x; + } + } + return b; +} +uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c) +{ + uint32_t n, b; + if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c]; + if (k == (uint32_t)(-1)) return 0; + if (k >= bwt->primary) --k; // because $ is not in bwt + n = bwt->occ[k/16<<2|c]; + b = bwt->bwt[k/16] & ~((1U<<((15-(k&15))<<1)) - 1); + n += (bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff] + + bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24]) >> (c<<3) & 0xff; + if (c == 0) n -= 15 - (k&15); // corrected for the masked bits + return n; +} +void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]) +{ + uint32_t x, b; + if (k == (uint32_t)(-1)) { + memset(cnt, 0, 16); + return; + } + if (k >= bwt->primary) --k; // because $ is not in bwt + memcpy(cnt, bwt->occ + (k>>4<<2), 16); + b = bwt->bwt[k>>4] & ~((1U<<((~k&15)<<1)) - 1); + x = bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff] + + bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24]; + x -= 15 - (k&15); + cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24; +} +void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]) +{ + bwtl_occ4(bwt, k, cntk); + bwtl_occ4(bwt, l, cntl); +} +void bwtl_destroy(bwtl_t *bwt) +{ + if (bwt) { + free(bwt->occ); free(bwt->bwt); free(bwt->sa); + free(bwt); + } +} diff --git a/src/bwa/bwt_lite.h b/src/bwa/bwt_lite.h new file mode 100644 index 000000000..4fadccedc --- /dev/null +++ b/src/bwa/bwt_lite.h @@ 
-0,0 +1,29 @@ +#ifndef BWT_LITE_H_ +#define BWT_LITE_H_ + +#include + +typedef struct { + uint32_t seq_len, bwt_size, n_occ; + uint32_t primary; + uint32_t *bwt, *occ, *sa, L2[5]; + uint32_t cnt_table[256]; +} bwtl_t; + +#define bwtl_B0(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) + +#ifdef __cplusplus +extern "C" { +#endif + + bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq); + uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c); + void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]); + void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]); + void bwtl_destroy(bwtl_t *bwt); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/bwa/bwtaln.c b/src/bwa/bwtaln.c new file mode 100644 index 000000000..20b01cd3e --- /dev/null +++ b/src/bwa/bwtaln.c @@ -0,0 +1,320 @@ +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#include "bwtaln.h" +#include "bwtgap.h" +#include "utils.h" +#include "bwa.h" + +#ifdef HAVE_PTHREAD +#include +#endif + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +gap_opt_t *gap_init_opt() +{ + gap_opt_t *o; + o = (gap_opt_t*)calloc(1, sizeof(gap_opt_t)); + /* IMPORTANT: s_mm*10 should be about the average base error + rate. Voilating this requirement will break pairing! 
*/ + o->s_mm = 3; o->s_gapo = 11; o->s_gape = 4; + o->max_diff = -1; o->max_gapo = 1; o->max_gape = 6; + o->indel_end_skip = 5; o->max_del_occ = 10; o->max_entries = 2000000; + o->mode = BWA_MODE_GAPE | BWA_MODE_COMPREAD; + o->seed_len = 32; o->max_seed_diff = 2; + o->fnr = 0.04; + o->n_threads = 1; + o->max_top2 = 30; + o->trim_qual = 0; + return o; +} + +int bwa_cal_maxdiff(int l, double err, double thres) +{ + double elambda = exp(-l * err); + double sum, y = 1.0; + int k, x = 1; + for (k = 1, sum = elambda; k < 1000; ++k) { + y *= l * err; + x *= k; + sum += elambda * y / x; + if (1.0 - sum < thres) return k; + } + return 2; +} + +// width must be filled as zero +int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width) +{ + bwtint_t k, l, ok, ol; + int i, bid; + bid = 0; + k = 0; l = bwt->seq_len; + for (i = 0; i < len; ++i) { + ubyte_t c = str[i]; + if (c < 4) { + bwt_2occ(bwt, k - 1, l, c, &ok, &ol); + k = bwt->L2[c] + ok + 1; + l = bwt->L2[c] + ol; + } + if (k > l || c > 3) { // then restart + k = 0; + l = bwt->seq_len; + ++bid; + } + width[i].w = l - k + 1; + width[i].bid = bid; + } + width[len].w = 0; + width[len].bid = ++bid; + return bid; +} + +void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt) +{ + int i, j, max_l = 0, max_len; + gap_stack_t *stack; + bwt_width_t *w, *seed_w; + gap_opt_t local_opt = *opt; + + // initiate priority stack + for (i = max_len = 0; i != n_seqs; ++i) + if (seqs[i].len > max_len) max_len = seqs[i].len; + if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(max_len, BWA_AVG_ERR, opt->fnr); + if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff; + stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt); + + seed_w = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); + w = 0; + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *p = seqs + i; +#ifdef HAVE_PTHREAD + if (i % 
opt->n_threads != tid) continue; +#endif + p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0; + if (max_l < p->len) { + max_l = p->len; + w = (bwt_width_t*)realloc(w, (max_l + 1) * sizeof(bwt_width_t)); + memset(w, 0, (max_l + 1) * sizeof(bwt_width_t)); + } + bwt_cal_width(bwt, p->len, p->seq, w); + if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr); + local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff; + if (p->len > opt->seed_len) + bwt_cal_width(bwt, opt->seed_len, p->seq + (p->len - opt->seed_len), seed_w); + // core function + for (j = 0; j < p->len; ++j) // we need to complement + p->seq[j] = p->seq[j] > 3? 4 : 3 - p->seq[j]; + p->aln = bwt_match_gap(bwt, p->len, p->seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack); + //fprintf(stderr, "mm=%lld,ins=%lld,del=%lld,gapo=%lld\n", p->aln->n_mm, p->aln->n_ins, p->aln->n_del, p->aln->n_gapo); + // clean up the unused data in the record + free(p->name); free(p->seq); free(p->rseq); free(p->qual); + p->name = 0; p->seq = p->rseq = p->qual = 0; + } + free(seed_w); free(w); + gap_destroy_stack(stack); +} + +#ifdef HAVE_PTHREAD +typedef struct { + int tid; + bwt_t *bwt; + int n_seqs; + bwa_seq_t *seqs; + const gap_opt_t *opt; +} thread_aux_t; + +static void *worker(void *data) +{ + thread_aux_t *d = (thread_aux_t*)data; + bwa_cal_sa_reg_gap(d->tid, d->bwt, d->n_seqs, d->seqs, d->opt); + return 0; +} +#endif + +bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa) +{ + bwa_seqio_t *ks; + if (mode & BWA_MODE_BAM) { // open BAM + int which = 0; + if (mode & BWA_MODE_BAM_SE) which |= 4; + if (mode & BWA_MODE_BAM_READ1) which |= 1; + if (mode & BWA_MODE_BAM_READ2) which |= 2; + if (which == 0) which = 7; // then read all reads + ks = bwa_bam_open(fn_fa, which); + } else ks = bwa_seq_open(fn_fa); + return ks; +} + +void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) +{ + int i, 
n_seqs, tot_seqs = 0; + bwa_seq_t *seqs; + bwa_seqio_t *ks; + clock_t t; + bwt_t *bwt; + + // initialization + ks = bwa_open_reads(opt->mode, fn_fa); + + { // load BWT + char *str = (char*)calloc(strlen(prefix) + 10, 1); + strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); + free(str); + } + + // core loop + err_fwrite(SAI_MAGIC, 1, 4, stdout); + err_fwrite(opt, sizeof(gap_opt_t), 1, stdout); + while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) { + tot_seqs += n_seqs; + t = clock(); + + fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... "); + +#ifdef HAVE_PTHREAD + if (opt->n_threads <= 1) { // no multi-threading at all + bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); + } else { + pthread_t *tid; + pthread_attr_t attr; + thread_aux_t *data; + int j; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); + tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); + for (j = 0; j < opt->n_threads; ++j) { + data[j].tid = j; data[j].bwt = bwt; + data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt; + pthread_create(&tid[j], &attr, worker, data + j); + } + for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0); + free(data); free(tid); + } +#else + bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); +#endif + + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + + t = clock(); + fprintf(stderr, "[bwa_aln_core] write to the disk... 
"); + for (i = 0; i < n_seqs; ++i) { + bwa_seq_t *p = seqs + i; + err_fwrite(&p->n_aln, 4, 1, stdout); + if (p->n_aln) err_fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout); + } + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + + bwa_free_read_seq(n_seqs, seqs); + fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs); + } + + // destroy + bwt_destroy(bwt); + bwa_seq_close(ks); +} + +int bwa_aln(int argc, char *argv[]) +{ + int c, opte = -1; + gap_opt_t *opt; + char *prefix; + + opt = gap_init_opt(); + while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:LR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { + switch (c) { + case 'n': + if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; + else opt->max_diff = atoi(optarg), opt->fnr = -1.0; + break; + case 'o': opt->max_gapo = atoi(optarg); break; + case 'e': opte = atoi(optarg); break; + case 'M': opt->s_mm = atoi(optarg); break; + case 'O': opt->s_gapo = atoi(optarg); break; + case 'E': opt->s_gape = atoi(optarg); break; + case 'd': opt->max_del_occ = atoi(optarg); break; + case 'i': opt->indel_end_skip = atoi(optarg); break; + case 'l': opt->seed_len = atoi(optarg); break; + case 'k': opt->max_seed_diff = atoi(optarg); break; + case 'm': opt->max_entries = atoi(optarg); break; + case 't': opt->n_threads = atoi(optarg); break; + case 'L': opt->mode |= BWA_MODE_LOGGAP; break; + case 'R': opt->max_top2 = atoi(optarg); break; + case 'q': opt->trim_qual = atoi(optarg); break; + case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break; + case 'f': xreopen(optarg, "wb", stdout); break; + case 'b': opt->mode |= BWA_MODE_BAM; break; + case '0': opt->mode |= BWA_MODE_BAM_SE; break; + case '1': opt->mode |= BWA_MODE_BAM_READ1; break; + case '2': opt->mode |= BWA_MODE_BAM_READ2; break; + case 'I': opt->mode |= BWA_MODE_IL13; break; + case 'Y': opt->mode |= BWA_MODE_CFY; break; + case 'B': opt->mode |= atoi(optarg) << 24; break; + default: return 1; + } + } + if 
(opte > 0) { + opt->max_gape = opte; + opt->mode &= ~BWA_MODE_GAPE; + } + + if (optind + 2 > argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa aln [options] \n\n"); + fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n", + BWA_AVG_ERR, opt->fnr); + fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo); + fprintf(stderr, " -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]\n"); + fprintf(stderr, " -i INT do not put an indel within INT bp towards the ends [%d]\n", opt->indel_end_skip); + fprintf(stderr, " -d INT maximum occurrences for extending a long deletion [%d]\n", opt->max_del_occ); + fprintf(stderr, " -l INT seed length [%d]\n", opt->seed_len); + fprintf(stderr, " -k INT maximum differences in the seed [%d]\n", opt->max_seed_diff); + fprintf(stderr, " -m INT maximum entries in the queue [%d]\n", opt->max_entries); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -M INT mismatch penalty [%d]\n", opt->s_mm); + fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->s_gapo); + fprintf(stderr, " -E INT gap extension penalty [%d]\n", opt->s_gape); + fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2); + fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual); + fprintf(stderr, " -f FILE file to write output to instead of stdout\n"); + fprintf(stderr, " -B INT length of barcode\n"); + fprintf(stderr, " -L log-scaled gap penalty for long deletions\n"); + fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n"); + fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n"); + fprintf(stderr, " -b the input read file is in the BAM format\n"); + fprintf(stderr, " -0 use single-end reads only (effective with -b)\n"); + fprintf(stderr, " -1 use the 1st read in 
a pair (effective with -b)\n"); + fprintf(stderr, " -2 use the 2nd read in a pair (effective with -b)\n"); + fprintf(stderr, " -Y filter Casava-filtered sequences\n"); + fprintf(stderr, "\n"); + return 1; + } + if (opt->fnr > 0.0) { + int i, k; + for (i = 17, k = 0; i <= 250; ++i) { + int l = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr); + if (l != k) fprintf(stderr, "[bwa_aln] %dbp reads: max_diff = %d\n", i, l); + k = l; + } + } + if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { + fprintf(stderr, "[%s] fail to locate the index\n", __func__); + free(opt); + return 1; + } + bwa_aln_core(prefix, argv[optind+1], opt); + free(opt); free(prefix); + return 0; +} diff --git a/src/bwa/bwtaln.h b/src/bwa/bwtaln.h new file mode 100644 index 000000000..4616ff5a6 --- /dev/null +++ b/src/bwa/bwtaln.h @@ -0,0 +1,153 @@ +#ifndef BWTALN_H +#define BWTALN_H + +#include +#include "bwt.h" + +#define BWA_TYPE_NO_MATCH 0 +#define BWA_TYPE_UNIQUE 1 +#define BWA_TYPE_REPEAT 2 +#define BWA_TYPE_MATESW 3 + +#define SAM_FPD 1 // paired +#define SAM_FPP 2 // properly paired +#define SAM_FSU 4 // self-unmapped +#define SAM_FMU 8 // mate-unmapped +#define SAM_FSR 16 // self on the reverse strand +#define SAM_FMR 32 // mate on the reverse strand +#define SAM_FR1 64 // this is read one +#define SAM_FR2 128 // this is read two +#define SAM_FSC 256 // secondary alignment + +#define BWA_AVG_ERR 0.02 +#define BWA_MIN_RDLEN 35 // for read trimming + +#define BWA_MAX_BCLEN 63 // maximum barcode length; 127 is the maximum + +#ifndef bns_pac +#define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3) +#endif + +#define FROM_M 0 +#define FROM_I 1 +#define FROM_D 2 +#define FROM_S 3 + +#define SAI_MAGIC "SAI\1" + +typedef struct { + bwtint_t w; + int bid; +} bwt_width_t; + +typedef struct { + uint64_t n_mm:8, n_gapo:8, n_gape:8, score:20, n_ins:10, n_del:10; + bwtint_t k, l; +} bwt_aln1_t; + +typedef uint16_t bwa_cigar_t; +/* rgoya: If changing order of bytes, beware of operations like: + * 
s->cigar[0] += s->full_len - s->len; + */ +#define CIGAR_OP_SHIFT 14 +#define CIGAR_LN_MASK 0x3fff + +#define __cigar_op(__cigar) ((__cigar)>>CIGAR_OP_SHIFT) +#define __cigar_len(__cigar) ((__cigar)&CIGAR_LN_MASK) +#define __cigar_create(__op, __len) ((__op)< +#include +#include +#include "bwtgap.h" +#include "bwtaln.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +#define STATE_M 0 +#define STATE_I 1 +#define STATE_D 2 + +#define aln_score(m,o,e,p) ((m)*(p)->s_mm + (o)*(p)->s_gapo + (e)*(p)->s_gape) + +gap_stack_t *gap_init_stack2(int max_score) +{ + gap_stack_t *stack; + stack = (gap_stack_t*)calloc(1, sizeof(gap_stack_t)); + stack->n_stacks = max_score; + stack->stacks = (gap_stack1_t*)calloc(stack->n_stacks, sizeof(gap_stack1_t)); + return stack; +} + +gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt) +{ + return gap_init_stack2(aln_score(max_mm+1, max_gapo+1, max_gape+1, opt)); +} + +void gap_destroy_stack(gap_stack_t *stack) +{ + int i; + for (i = 0; i != stack->n_stacks; ++i) free(stack->stacks[i].stack); + free(stack->stacks); + free(stack); +} + +static void gap_reset_stack(gap_stack_t *stack) +{ + int i; + for (i = 0; i != stack->n_stacks; ++i) + stack->stacks[i].n_entries = 0; + stack->best = stack->n_stacks; + stack->n_entries = 0; +} + +static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape, int n_ins, int n_del, + int state, int is_diff, const gap_opt_t *opt) +{ + int score; + gap_entry_t *p; + gap_stack1_t *q; + score = aln_score(n_mm, n_gapo, n_gape, opt); + q = stack->stacks + score; + if (q->n_entries == q->m_entries) { + q->m_entries = q->m_entries? 
q->m_entries<<1 : 4; + q->stack = (gap_entry_t*)realloc(q->stack, sizeof(gap_entry_t) * q->m_entries); + } + p = q->stack + q->n_entries; + p->info = (u_int32_t)score<<21 | i; p->k = k; p->l = l; + p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape; + p->n_ins = n_ins; p->n_del = n_del; + p->state = state; + p->last_diff_pos = is_diff? i : 0; + ++(q->n_entries); + ++(stack->n_entries); + if (stack->best > score) stack->best = score; +} + +static inline void gap_pop(gap_stack_t *stack, gap_entry_t *e) +{ + gap_stack1_t *q; + q = stack->stacks + stack->best; + *e = q->stack[q->n_entries - 1]; + --(q->n_entries); + --(stack->n_entries); + if (q->n_entries == 0 && stack->n_entries) { // reset best + int i; + for (i = stack->best + 1; i < stack->n_stacks; ++i) + if (stack->stacks[i].n_entries != 0) break; + stack->best = i; + } else if (stack->n_entries == 0) stack->best = stack->n_stacks; +} + +static inline void gap_shadow(int x, int len, bwtint_t max, int last_diff_pos, bwt_width_t *w) +{ + int i, j; + for (i = j = 0; i < last_diff_pos; ++i) { + if (w[i].w > x) w[i].w -= x; + else if (w[i].w == x) { + w[i].bid = 1; + w[i].w = max - (++j); + } // else should not happen + } +} + +static inline int int_log2(uint32_t v) +{ + int c = 0; + if (v & 0xffff0000u) { v >>= 16; c |= 16; } + if (v & 0xff00) { v >>= 8; c |= 8; } + if (v & 0xf0) { v >>= 4; c |= 4; } + if (v & 0xc) { v >>= 2; c |= 2; } + if (v & 0x2) c |= 1; + return c; +} + +bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *width, + bwt_width_t *seed_width, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack) +{ // $seq is the reverse complement of the input read + int best_score = aln_score(opt->max_diff+1, opt->max_gapo+1, opt->max_gape+1, opt); + int best_diff = opt->max_diff + 1, max_diff = opt->max_diff; + int best_cnt = 0; + int max_entries = 0, j, _j, n_aln, m_aln; + bwt_aln1_t *aln; + + m_aln = 4; n_aln = 0; + aln = (bwt_aln1_t*)calloc(m_aln, sizeof(bwt_aln1_t)); + 
+ // check whether there are too many N + for (j = _j = 0; j < len; ++j) + if (seq[j] > 3) ++_j; + if (_j > max_diff) { + *_n_aln = n_aln; + return aln; + } + + //for (j = 0; j != len; ++j) printf("#0 %d: [%d,%u]\t[%d,%u]\n", j, w[0][j].bid, w[0][j].w, w[1][j].bid, w[1][j].w); + gap_reset_stack(stack); // reset stack + gap_push(stack, len, 0, bwt->seq_len, 0, 0, 0, 0, 0, 0, 0, opt); + + while (stack->n_entries) { + gap_entry_t e; + int i, m, m_seed = 0, hit_found, allow_diff, allow_M, tmp; + bwtint_t k, l, cnt_k[4], cnt_l[4], occ; + + if (max_entries < stack->n_entries) max_entries = stack->n_entries; + if (stack->n_entries > opt->max_entries) break; + gap_pop(stack, &e); // get the best entry + k = e.k; l = e.l; // SA interval + i = e.info&0xffff; // length + if (!(opt->mode & BWA_MODE_NONSTOP) && e.info>>21 > best_score + opt->s_mm) break; // no need to proceed + + m = max_diff - (e.n_mm + e.n_gapo); + if (opt->mode & BWA_MODE_GAPE) m -= e.n_gape; + if (m < 0) continue; + if (seed_width) { // apply seeding + m_seed = opt->max_seed_diff - (e.n_mm + e.n_gapo); + if (opt->mode & BWA_MODE_GAPE) m_seed -= e.n_gape; + } + //printf("#1\t[%d,%d,%d,%c]\t[%d,%d,%d]\t[%u,%u]\t[%u,%u]\t%d\n", stack->n_entries, a, i, "MID"[e.state], e.n_mm, e.n_gapo, e.n_gape, width[i-1].bid, width[i-1].w, k, l, e.last_diff_pos); + if (i > 0 && m < width[i-1].bid) continue; + + // check whether a hit is found + hit_found = 0; + if (i == 0) hit_found = 1; + else if (m == 0 && (e.state == STATE_M || (opt->mode&BWA_MODE_GAPE) || e.n_gape == opt->max_gape)) { // no diff allowed + if (bwt_match_exact_alt(bwt, i, seq, &k, &l)) hit_found = 1; + else continue; // no hit, skip + } + + if (hit_found) { // action for found hits + int score = aln_score(e.n_mm, e.n_gapo, e.n_gape, opt); + int do_add = 1; + //printf("#2 hits found: %d:(%u,%u)\n", e.n_mm+e.n_gapo, k, l); + if (n_aln == 0) { + best_score = score; + best_diff = e.n_mm + e.n_gapo; + if (opt->mode & BWA_MODE_GAPE) best_diff += e.n_gape; + if 
(!(opt->mode & BWA_MODE_NONSTOP)) + max_diff = (best_diff + 1 > opt->max_diff)? opt->max_diff : best_diff + 1; // top2 behaviour + } + if (score == best_score) best_cnt += l - k + 1; + else if (best_cnt > opt->max_top2) break; // top2b behaviour + if (e.n_gapo) { // check whether the hit has been found. this may happen when a gap occurs in a tandem repeat + for (j = 0; j != n_aln; ++j) + if (aln[j].k == k && aln[j].l == l) break; + if (j < n_aln) do_add = 0; + } + if (do_add) { // append + bwt_aln1_t *p; + gap_shadow(l - k + 1, len, bwt->seq_len, e.last_diff_pos, width); + if (n_aln == m_aln) { + m_aln <<= 1; + aln = (bwt_aln1_t*)realloc(aln, m_aln * sizeof(bwt_aln1_t)); + memset(aln + m_aln/2, 0, m_aln/2*sizeof(bwt_aln1_t)); + } + p = aln + n_aln; + p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape; + p->n_ins = e.n_ins; p->n_del = e.n_del; + p->k = k; p->l = l; + p->score = score; + //fprintf(stderr, "*** n_mm=%d,n_gapo=%d,n_gape=%d,n_ins=%d,n_del=%d\n", e.n_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del); + ++n_aln; + } + continue; + } + + --i; + bwt_2occ4(bwt, k - 1, l, cnt_k, cnt_l); // retrieve Occ values + occ = l - k + 1; + // test whether diff is allowed + allow_diff = allow_M = 1; + if (i > 0) { + int ii = i - (len - opt->seed_len); + if (width[i-1].bid > m-1) allow_diff = 0; + else if (width[i-1].bid == m-1 && width[i].bid == m-1 && width[i-1].w == width[i].w) allow_M = 0; + if (seed_width && ii > 0) { + if (seed_width[ii-1].bid > m_seed-1) allow_diff = 0; + else if (seed_width[ii-1].bid == m_seed-1 && seed_width[ii].bid == m_seed-1 + && seed_width[ii-1].w == seed_width[ii].w) allow_M = 0; + } + } + // indels + tmp = (opt->mode & BWA_MODE_LOGGAP)? 
int_log2(e.n_gape + e.n_gapo)/2+1 : e.n_gapo + e.n_gape; + if (allow_diff && i >= opt->indel_end_skip + tmp && len - i >= opt->indel_end_skip + tmp) { + if (e.state == STATE_M) { // gap open + if (e.n_gapo < opt->max_gapo) { // gap open is allowed + // insertion + gap_push(stack, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, e.n_ins + 1, e.n_del, STATE_I, 1, opt); + // deletion + for (j = 0; j != 4; ++j) { + k = bwt->L2[j] + cnt_k[j] + 1; + l = bwt->L2[j] + cnt_l[j]; + if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, e.n_ins, e.n_del + 1, STATE_D, 1, opt); + } + } + } else if (e.state == STATE_I) { // extention of an insertion + if (e.n_gape < opt->max_gape) // gap extention is allowed + gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, e.n_ins + 1, e.n_del, STATE_I, 1, opt); + } else if (e.state == STATE_D) { // extention of a deletion + if (e.n_gape < opt->max_gape) { // gap extention is allowed + if (e.n_gape + e.n_gapo < max_diff || occ < opt->max_del_occ) { + for (j = 0; j != 4; ++j) { + k = bwt->L2[j] + cnt_k[j] + 1; + l = bwt->L2[j] + cnt_l[j]; + if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, e.n_ins, e.n_del + 1, STATE_D, 1, opt); + } + } + } + } + } + // mismatches + if (allow_diff && allow_M) { // mismatch is allowed + for (j = 1; j <= 4; ++j) { + int c = (seq[i] + j) & 3; + int is_mm = (j != 4 || seq[i] > 3); + k = bwt->L2[c] + cnt_k[c] + 1; + l = bwt->L2[c] + cnt_l[c]; + if (k <= l) gap_push(stack, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del, STATE_M, is_mm, opt); + } + } else if (seq[i] < 4) { // try exact match only + int c = seq[i] & 3; + k = bwt->L2[c] + cnt_k[c] + 1; + l = bwt->L2[c] + cnt_l[c]; + if (k <= l) gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del, STATE_M, 0, opt); + } + } + + *_n_aln = n_aln; + //fprintf(stderr, "max_entries = %d\n", max_entries); + return aln; +} diff --git a/src/bwa/bwtgap.h b/src/bwa/bwtgap.h new file mode 100644 index 
000000000..7dd616590 --- /dev/null +++ b/src/bwa/bwtgap.h @@ -0,0 +1,40 @@ +#ifndef BWTGAP_H_ +#define BWTGAP_H_ + +#include "bwt.h" +#include "bwtaln.h" + +typedef struct { // recursion stack + u_int32_t info; // score<<21 | i + u_int32_t n_mm:8, n_gapo:8, n_gape:8, state:2, n_seed_mm:6; + u_int32_t n_ins:16, n_del:16; + int last_diff_pos; + bwtint_t k, l; // (k,l) is the SA region of [i,n-1] +} gap_entry_t; + +typedef struct { + int n_entries, m_entries; + gap_entry_t *stack; +} gap_stack1_t; + +typedef struct { + int n_stacks, best, n_entries; + gap_stack1_t *stacks; +} gap_stack_t; + +#ifdef __cplusplus +extern "C" { +#endif + + gap_stack_t *gap_init_stack2(int max_score); + gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt); + void gap_destroy_stack(gap_stack_t *stack); + bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *w, + bwt_width_t *seed_w, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack); + void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/bwa/bwtindex.c b/src/bwa/bwtindex.c new file mode 100644 index 000000000..aa78b73d8 --- /dev/null +++ b/src/bwa/bwtindex.c @@ -0,0 +1,296 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#include +#include +#include +#include +#include +#include +#include "bntseq.h" +#include "bwt.h" +#include "utils.h" + +#ifdef _DIVBWT +#include "divsufsort.h" +#endif + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + + +int is_bwt(ubyte_t *T, int n); + +int64_t bwa_seq_len(const char *fn_pac) +{ + FILE *fp; + int64_t pac_len; + ubyte_t c; + fp = xopen(fn_pac, "rb"); + err_fseek(fp, -1, SEEK_END); + pac_len = err_ftell(fp); + err_fread_noeof(&c, 1, 1, fp); + err_fclose(fp); + return (pac_len - 1) * 4 + (int)c; +} + +bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) +{ + bwt_t *bwt; + ubyte_t *buf, *buf2; + int i, pac_size; + FILE *fp; + + // initialization + bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); + bwt->seq_len = bwa_seq_len(fn_pac); + bwt->bwt_size = (bwt->seq_len + 15) >> 4; + fp = xopen(fn_pac, "rb"); + + // prepare sequence + pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 
0 : 1); + buf2 = (ubyte_t*)calloc(pac_size, 1); + err_fread_noeof(buf2, 1, pac_size, fp); + err_fclose(fp); + memset(bwt->L2, 0, 5 * 4); + buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1); + for (i = 0; i < bwt->seq_len; ++i) { + buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3; + ++bwt->L2[1+buf[i]]; + } + for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1]; + free(buf2); + + // Burrows-Wheeler Transform + if (use_is) { + bwt->primary = is_bwt(buf, bwt->seq_len); + } else { +#ifdef _DIVBWT + bwt->primary = divbwt(buf, buf, 0, bwt->seq_len); +#else + err_fatal_simple("libdivsufsort is not compiled in."); +#endif + } + bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4); + for (i = 0; i < bwt->seq_len; ++i) + bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1); + free(buf); + return bwt; +} + +int bwa_pac2bwt(int argc, char *argv[]) // the "pac2bwt" command; IMPORTANT: bwt generated at this step CANNOT be used with BWA. bwtupdate is required! +{ + bwt_t *bwt; + int c, use_is = 1; + while ((c = getopt(argc, argv, "d")) >= 0) { + switch (c) { + case 'd': use_is = 0; break; + default: return 1; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: bwa pac2bwt [-d] \n"); + return 1; + } + bwt = bwt_pac2bwt(argv[optind], use_is); + bwt_dump_bwt(argv[optind+1], bwt); + bwt_destroy(bwt); + return 0; +} + +#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) + +void bwt_bwtupdate_core(bwt_t *bwt) +{ + bwtint_t i, k, c[4], n_occ; + uint32_t *buf; + + n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; + bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size + buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt + c[0] = c[1] = c[2] = c[3] = 0; + for (i = k = 0; i < bwt->seq_len; ++i) { + if (i % OCC_INTERVAL == 0) { + memcpy(buf + k, c, sizeof(bwtint_t) * 4); + k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4) + } + if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2 + ++c[bwt_B00(bwt, i)]; + } + // the 
last element + memcpy(buf + k, c, sizeof(bwtint_t) * 4); + xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size"); + // update bwt + free(bwt->bwt); bwt->bwt = buf; +} + +int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command +{ + bwt_t *bwt; + if (argc < 2) { + fprintf(stderr, "Usage: bwa bwtupdate \n"); + return 1; + } + bwt = bwt_restore_bwt(argv[1]); + bwt_bwtupdate_core(bwt); + bwt_dump_bwt(argv[1], bwt); + bwt_destroy(bwt); + return 0; +} + +int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command +{ + bwt_t *bwt; + int c, sa_intv = 32; + while ((c = getopt(argc, argv, "i:")) >= 0) { + switch (c) { + case 'i': sa_intv = atoi(optarg); break; + default: return 1; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: bwa bwt2sa [-i %d] \n", sa_intv); + return 1; + } + bwt = bwt_restore_bwt(argv[optind]); + bwt_cal_sa(bwt, sa_intv); + bwt_dump_sa(argv[optind+1], bwt); + bwt_destroy(bwt); + return 0; +} + +int bwa_index(int argc, char *argv[]) // the "index" command +{ + extern void bwa_pac_rev_core(const char *fn, const char *fn_rev); + + char *prefix = 0, *str, *str2, *str3; + int c, algo_type = 0, is_64 = 0, block_size = 10000000; + clock_t t; + int64_t l_pac; + + while ((c = getopt(argc, argv, "6a:p:b:")) >= 0) { + switch (c) { + case 'a': // if -a is not set, algo_type will be determined later + if (strcmp(optarg, "div") == 0) algo_type = 1; + else if (strcmp(optarg, "bwtsw") == 0) algo_type = 2; + else if (strcmp(optarg, "is") == 0) algo_type = 3; + else err_fatal(__func__, "unknown algorithm: '%s'.", optarg); + break; + case 'p': prefix = strdup(optarg); break; + case '6': is_64 = 1; break; + case 'b': + block_size = strtol(optarg, &str, 10); + if (*str == 'G' || *str == 'g') block_size *= 1024 * 1024 * 1024; + else if (*str == 'M' || *str == 'm') block_size *= 1024 * 1024; + else if (*str == 'K' || *str == 'k') block_size *= 1024; + break; + default: return 1; + } + } + + if (optind + 1 > argc) { + 
fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa index [options] \n\n"); + fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n"); + fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n"); + fprintf(stderr, " -b INT block size for the bwtsw algorithm (effective with -a bwtsw) [%d]\n", block_size); + fprintf(stderr, " -6 index files named as .64.* instead of .* \n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n"); + fprintf(stderr, " `-a div' do not work not for long genomes.\n\n"); + return 1; + } + + fprintf(stdout, "\n\nstarting indexing\n"); //ddebug + if (prefix == 0) { + prefix = malloc(strlen(argv[optind]) + 4); + strcpy(prefix, argv[optind]); + if (is_64) strcat(prefix, ".64"); + } + str = (char*)calloc(strlen(prefix) + 10, 1); + str2 = (char*)calloc(strlen(prefix) + 10, 1); + str3 = (char*)calloc(strlen(prefix) + 10, 1); + + { // nucleotide indexing + gzFile fp = xzopen(argv[optind], "r"); + t = clock(); + fprintf(stderr, "[bwa_index] Pack FASTA... "); + l_pac = bns_fasta2bntseq(fp, prefix, 0); + fprintf(stdout, "\n\nlength pac: %d\n", l_pac); //ddebug + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + err_gzclose(fp); + } + if (algo_type == 0) algo_type = l_pac > 50000000? 
2 : 3; // set the algorithm for generating BWT + { + strcpy(str, prefix); strcat(str, ".pac"); + strcpy(str2, prefix); strcat(str2, ".bwt"); + t = clock(); + fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n"); + if (algo_type == 2) bwt_bwtgen2(str, str2, block_size); + else if (algo_type == 1 || algo_type == 3) { + bwt_t *bwt; + bwt = bwt_pac2bwt(str, algo_type == 3); + bwt_dump_bwt(str2, bwt); + bwt_destroy(bwt); + } + fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC); + } + { + bwt_t *bwt; + strcpy(str, prefix); strcat(str, ".bwt"); + t = clock(); + fprintf(stderr, "[bwa_index] Update BWT... "); + bwt = bwt_restore_bwt(str); + bwt_bwtupdate_core(bwt); + bwt_dump_bwt(str, bwt); + bwt_destroy(bwt); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + } + { + gzFile fp = xzopen(argv[optind], "r"); + t = clock(); + fprintf(stderr, "[bwa_index] Pack forward-only FASTA... "); + l_pac = bns_fasta2bntseq(fp, prefix, 1); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + err_gzclose(fp); + } + { + bwt_t *bwt; + strcpy(str, prefix); strcat(str, ".bwt"); + strcpy(str3, prefix); strcat(str3, ".sa"); + t = clock(); + fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... 
"); + bwt = bwt_restore_bwt(str); + bwt_cal_sa(bwt, 32); + bwt_dump_sa(str3, bwt); + bwt_destroy(bwt); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + } + free(str3); free(str2); free(str); free(prefix); + return 0; +} diff --git a/src/bwa/bwtsw2.h b/src/bwa/bwtsw2.h new file mode 100644 index 000000000..0ec96767c --- /dev/null +++ b/src/bwa/bwtsw2.h @@ -0,0 +1,69 @@ +#ifndef LH3_BWTSW2_H +#define LH3_BWTSW2_H + +#include +#include "bntseq.h" +#include "bwt_lite.h" +#include "bwt.h" + +#define BSW2_FLAG_MATESW 0x100 +#define BSW2_FLAG_TANDEM 0x200 +#define BSW2_FLAG_MOVED 0x400 +#define BSW2_FLAG_RESCUED 0x800 + +typedef struct { + int skip_sw:8, cpy_cmt:8, hard_clip:16; + int a, b, q, r, t, qr, bw, max_ins, max_chain_gap; + int z, is, t_seeds, multi_2nd; + float mask_level, coef; + int n_threads, chunk_size; +} bsw2opt_t; + +typedef struct { + bwtint_t k, l; + uint32_t flag:18, n_seeds:13, is_rev:1; + int len, G, G2; + int beg, end; +} bsw2hit_t; + +typedef struct { + int flag, nn, n_cigar, chr, pos, qual, mchr, mpos, pqual, isize, nm; + uint32_t *cigar; +} bsw2aux_t; + +typedef struct { + int n, max; + bsw2hit_t *hits; + bsw2aux_t *aux; +} bwtsw2_t; + +typedef struct { + void *stack; + int max_l; + uint8_t *aln_mem; +} bsw2global_t; + +typedef struct { + int l, tid; + char *name, *seq, *qual, *sam, *comment; +} bsw2seq1_t; + +#ifdef __cplusplus +extern "C" { +#endif + + bsw2opt_t *bsw2_init_opt(); + bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool); + void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2); + void bsw2_destroy(bwtsw2_t *b); + + bsw2global_t *bsw2_global_init(); + void bsw2_global_destroy(bsw2global_t *_pool); + + void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hit); + +#ifdef __cplusplus +} +#endif + +#endif diff --git 
a/src/bwa/bwtsw2_aux.c b/src/bwa/bwtsw2_aux.c new file mode 100644 index 000000000..d225187ed --- /dev/null +++ b/src/bwa/bwtsw2_aux.c @@ -0,0 +1,776 @@ +#include +#include +#include +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#ifdef HAVE_PTHREAD +#include +#endif +#include "bntseq.h" +#include "bwt_lite.h" +#include "utils.h" +#include "bwtsw2.h" +#include "kstring.h" +#include "bwa.h" +#include "ksw.h" + +#include "kseq.h" +KSEQ_DECLARE(gzFile) + +#include "ksort.h" +#define __left_lt(a, b) ((a).end > (b).end) +KSORT_INIT(hit, bsw2hit_t, __left_lt) + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + + +extern unsigned char nst_nt4_table[256]; + +unsigned char nt_comp_table[256] = { + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N', + 'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N', + 'n','t','v','g', 'h','n','n','c', 'd','n','n','m', 'n','k','n','n', + 'n','n','y','s', 'a','n','b','w', 'x','r','n','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N' +}; + +extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS); +extern int bsw2_resolve_query_overlaps(bwtsw2_t *b, float 
mask_level); + +bsw2opt_t *bsw2_init_opt() +{ + bsw2opt_t *o = (bsw2opt_t*)calloc(1, sizeof(bsw2opt_t)); + o->a = 1; o->b = 3; o->q = 5; o->r = 2; o->t = 30; + o->bw = 50; + o->max_ins = 20000; + o->z = 1; o->is = 3; o->t_seeds = 5; o->hard_clip = 0; o->skip_sw = 0; + o->mask_level = 0.50f; o->coef = 5.5f; + o->qr = o->q + o->r; o->n_threads = 1; o->chunk_size = 10000000; + o->max_chain_gap = 10000; + o->cpy_cmt = 0; + return o; +} + +void bsw2_destroy(bwtsw2_t *b) +{ + int i; + if (b == 0) return; + if (b->aux) + for (i = 0; i < b->n; ++i) free(b->aux[i].cigar); + free(b->aux); free(b->hits); + free(b); +} + +bwtsw2_t *bsw2_dup_no_cigar(const bwtsw2_t *b) +{ + bwtsw2_t *p; + p = calloc(1, sizeof(bwtsw2_t)); + p->max = p->n = b->n; + if (b->n) { + kroundup32(p->max); + p->hits = calloc(p->max, sizeof(bsw2hit_t)); + memcpy(p->hits, b->hits, p->n * sizeof(bsw2hit_t)); + } + return p; +} + +#define __gen_ap(par, opt) do { \ + int i; \ + for (i = 0; i < 25; ++i) (par).matrix[i] = -(opt)->b; \ + for (i = 0; i < 4; ++i) (par).matrix[i*5+i] = (opt)->a; \ + (par).gap_open = (opt)->q; (par).gap_ext = (opt)->r; \ + (par).gap_end = (opt)->r; \ + (par).row = 5; (par).band_width = opt->bw; \ + } while (0) + +void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem) +{ + int i; + bwtint_t k; + uint8_t *target = 0, *query; + int8_t mat[25]; + + bwa_fill_scmat(opt->a, opt->b, mat); + query = calloc(lq, 1); + // sort according to the descending order of query end + ks_introsort(hit, b->n, b->hits); + target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1); + // reverse _query + for (i = 0; i < lq; ++i) query[lq - i - 1] = _query[i]; + // core loop + for (i = 0; i < b->n; ++i) { + bsw2hit_t *p = b->hits + i; + int lt = ((p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; + int score, j, qle, tle; + p->n_seeds = 1; + if (p->l || p->k == 0) continue; + for (j = score = 0; j < i; ++j) { + bsw2hit_t *q = 
b->hits + j; + if (q->beg <= p->beg && q->k <= p->k && q->k + q->len >= p->k + p->len) { + if (q->n_seeds < (1<<13) - 2) ++q->n_seeds; + ++score; + } + } + if (score) continue; + if (lt > p->k) lt = p->k; + for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered! + target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; + lt = j; + score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, p->G, &qle, &tle, 0, 0, 0); + if (score > p->G) { // extensible + p->G = score; + p->k -= tle; + p->len += tle; + p->beg -= qle; + } + } + free(query); free(target); +} + +void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem) +{ + int i; + bwtint_t k; + uint8_t *target; + int8_t mat[25]; + + bwa_fill_scmat(opt->a, opt->b, mat); + target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1); + for (i = 0; i < b->n; ++i) { + bsw2hit_t *p = b->hits + i; + int lt = ((lq - p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; + int j, score, qle, tle; + if (p->l) continue; + for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) + target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; + lt = j; + score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, 1, &qle, &tle, 0, 0, 0) - 1; +// if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G); + if (score >= p->G) { + p->G = score; + p->len = tle; + p->end = p->beg + qle; + } + } + free(target); +} + +/* generate CIGAR array(s) in b->cigar[] */ +static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], int64_t l_pac, const uint8_t *pac, bwtsw2_t *b, const char *name) +{ + int i; + int8_t mat[25]; + + bwa_fill_scmat(opt->a, opt->b, mat); + for (i = 0; i < b->n; ++i) { + bsw2hit_t *p = b->hits + i; + bsw2aux_t *q = b->aux + i; + uint8_t *query; + int beg, end, score; + if (p->l) continue; + beg = (p->flag & 0x10)? 
lq - p->end : p->beg; + end = (p->flag & 0x10)? lq - p->beg : p->end; + query = seq[(p->flag & 0x10)? 1 : 0] + beg; + q->cigar = bwa_gen_cigar(mat, opt->q, opt->r, opt->bw, l_pac, pac, end - beg, query, p->k, p->k + p->len, &score, &q->n_cigar, &q->nm); +#if 0 + if (name && score != p->G) { // debugging only + int j, glen = 0; + for (j = 0; j < q->n_cigar; ++j) + if ((q->cigar[j]&0xf) == 1 || (q->cigar[j]&0xf) == 2) + glen += q->cigar[j]>>4; + fprintf(stderr, "[E::%s] %s - unequal score: %d != %d; (qlen, aqlen, arlen, glen, bw) = (%d, %d, %d, %d, %d)\n", + __func__, name, score, p->G, lq, end - beg, p->len, glen, opt->bw); + } +#endif + if (q->cigar && (beg != 0 || end < lq)) { // write soft clipping + q->cigar = realloc(q->cigar, 4 * (q->n_cigar + 2)); + if (beg != 0) { + memmove(q->cigar + 1, q->cigar, q->n_cigar * 4); + q->cigar[0] = beg<<4 | 4; + ++q->n_cigar; + } + if (end < lq) { + q->cigar[q->n_cigar] = (lq - end)<<4 | 4; + ++q->n_cigar; + } + } + } +} + +/* this is for the debugging purpose only */ +void bsw2_debug_hits(const bwtsw2_t *b) +{ + int i; + printf("# raw hits: %d\n", b->n); + for (i = 0; i < b->n; ++i) { + bsw2hit_t *p = b->hits + i; + if (p->G > 0) + printf("G=%d, G2=%d, len=%d, [%d,%d), k=%lu, l=%lu, #seeds=%d, is_rev=%d\n", p->G, p->G2, p->len, p->beg, p->end, (long)p->k, (long)p->l, p->n_seeds, p->is_rev); + } +} + +static void merge_hits(bwtsw2_t *b[2], int l, int is_reverse) +{ + int i; + if (b[0]->n + b[1]->n > b[0]->max) { + b[0]->max = b[0]->n + b[1]->n; + b[0]->hits = realloc(b[0]->hits, b[0]->max * sizeof(bsw2hit_t)); + } + for (i = 0; i < b[1]->n; ++i) { + bsw2hit_t *p = b[0]->hits + b[0]->n + i; + *p = b[1]->hits[i]; + if (is_reverse) { + int x = p->beg; + p->beg = l - p->end; + p->end = l - x; + p->flag |= 0x10; + } + } + b[0]->n += b[1]->n; + bsw2_destroy(b[1]); + b[1] = 0; +} +/* seq[0] is the forward sequence and seq[1] is the reverse complement. 
*/
/* Align one query of length l against `target` and return the merged
 * hit set, which the caller owns.
 *
 * Steps, as performed below:
 *   1. build a bwtl_t from seq[0] and run bsw2_core(); it returns two
 *      hit sets _b[0] and _b[1];
 *   2. split the hits of each set by their is_rev bit into bb[strand][k];
 *      for is_rev hits the query interval is mirrored;
 *   3. chain-filter the two bb[*][1] sets;
 *   4. per strand: extend left, merge bb[k][1] into bb[k][0], resolve
 *      duplicates, extend right, resolve duplicates again;
 *   5. merge the second strand into the first (mirroring coordinates
 *      and setting flag 0x10), then resolve query overlaps. */
static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target,
								int l, uint8_t *seq[2], bsw2global_t *pool)
{
	extern void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]);
	bwtsw2_t *b[2], **bb[2], **_b, *p;
	int k, j;
	bwtl_t *query;
	query = bwtl_seq2bwtl(l, seq[0]);
	_b = bsw2_core(bns, opt, query, target, pool);
	bwtl_destroy(query);
	for (k = 0; k < 2; ++k) {
		bb[k] = calloc(2, sizeof(void*));
		bb[k][0] = calloc(1, sizeof(bwtsw2_t));
		bb[k][1] = calloc(1, sizeof(bwtsw2_t));
	}
	for (k = 0; k < 2; ++k) { // separate _b into bb[2] based on the strand
		for (j = 0; j < _b[k]->n; ++j) {
			bsw2hit_t *q;
			p = bb[_b[k]->hits[j].is_rev][k];
			if (p->n == p->max) { // grow geometrically, starting at 8 entries
				p->max = p->max? p->max<<1 : 8;
				p->hits = realloc(p->hits, p->max * sizeof(bsw2hit_t));
			}
			q = &p->hits[p->n++];
			*q = _b[k]->hits[j];
			if (_b[k]->hits[j].is_rev) { // mirror the query interval
				int x = q->beg;
				q->beg = l - q->end;
				q->end = l - x;
			}
		}
	}
	b[0] = bb[0][1]; b[1] = bb[1][1]; // bb[*][1] are "narrow SA hits"
	bsw2_chain_filter(opt, l, b); // NB: only unique seeds are chained
	for (k = 0; k < 2; ++k) {
		bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, pool->aln_mem);
		merge_hits(bb[k], l, 0); // bb[k][1] is merged to bb[k][0] here
		bsw2_resolve_duphits(0, 0, bb[k][0], 0);
		bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, pool->aln_mem);
		bsw2_resolve_duphits(0, 0, bb[k][0], 0);
		b[k] = bb[k][0];
		free(bb[k]); // only the two-slot holder; the hit sets live on in b[]
	}
	merge_hits(b, l, 1); // again, b[1] is merged to b[0]
	bsw2_resolve_query_overlaps(b[0], opt->mask_level);
	bsw2_destroy(_b[0]); bsw2_destroy(_b[1]); free(_b);
	return b[0];
}

/* Record in ->flag which of the two hit sets a hit came from: 0x10000
 * for hits of b[0] and 0x20000 for hits of b[1]. A hit of b[0] that has
 * an identical counterpart in b[1] (same beg, end, k, len and G) gets
 * both bits, as does that counterpart. */
static void flag_fr(bwtsw2_t *b[2])
{
	int i, j;
	for (i = 0; i < b[0]->n; ++i) {
		bsw2hit_t *p = b[0]->hits + i;
		p->flag |= 0x10000;
	}
	for (i = 0; i < b[1]->n; ++i) {
		bsw2hit_t *p = b[1]->hits + i;
		p->flag |= 0x20000;
	}
	for (i = 0; i < b[0]->n; ++i) {
		bsw2hit_t *p = b[0]->hits + i;
		for (j = 0; j < b[1]->n; ++j) {
			bsw2hit_t *q = b[1]->hits + j;
			if (q->beg == p->beg && q->end == p->end && q->k == p->k && q->len == p->len && q->G == p->G) {
				q->flag |= 0x30000; p->flag |= 0x30000;
				break; // only the first identical counterpart is marked
			}
		}
	}
}

/* A growable batch of reads: n entries in use out of max allocated. */
typedef struct {
	int n, max;
	bsw2seq1_t *seq;
} bsw2seq_t;

/* Trim a CIGAR whose reference span runs past the end of the reference
 * sequence that contains p->k.
 *
 * The CIGAR is walked once to obtain the reference end x and the query
 * length lq. When x exceeds the sequence length, the CIGAR is split at
 * the boundary into two candidates, each completed with a soft clip
 * (op 4) for the part of the query given to the other one. The candidate
 * with more matched query bases (mq[]) is copied back into `cigar`, and
 * p->len (plus p->k for the second candidate) is updated accordingly.
 *
 * Op codes as used here: 0 = match, 1 = insertion, 2 = deletion,
 * 4 and 5 = clipping. Returns the new number of CIGAR operations.
 *
 * NOTE(review): the copy back into `cigar` can be one operation longer
 * than the incoming n_cigar (nc can reach n_cigar + 1); this relies on
 * the caller's buffer having spare capacity -- confirm against
 * bwa_gen_cigar() and gen_cigar(). */
static int fix_cigar(const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *cigar)
{
	// FIXME: this routine does not work if the query bridge three reference sequences
	int32_t coor, refl, lq;
	int x, y, i, seqid;
	bns_cnt_ambi(bns, p->k, p->len, &seqid);
	coor = p->k - bns->anns[seqid].offset;
	refl = bns->anns[seqid].len;
	x = coor; y = 0;
	// test if the alignment goes beyond the boundary
	for (i = 0; i < n_cigar; ++i) {
		int op = cigar[i]&0xf, ln = cigar[i]>>4;
		if (op == 1 || op == 4 || op == 5) y += ln;
		else if (op == 2) x += ln;
		else x += ln, y += ln;
	}
	lq = y; // length of the query sequence
	if (x > refl) { // then fix it
		int j, nc, mq[2], nlen[2];
		uint32_t *cn;
		bwtint_t kk = 0;
		nc = mq[0] = mq[1] = nlen[0] = nlen[1] = 0;
		cn = calloc(n_cigar + 3, 4); // a split adds at most three operations
		x = coor; y = 0;
		for (i = j = 0; i < n_cigar; ++i) {
			int op = cigar[i]&0xf, ln = cigar[i]>>4;
			if (op == 4 || op == 5 || op == 1) { // ins or clipping
				y += ln;
				cn[j++] = cigar[i];
			} else if (op == 2) { // del
				if (x + ln >= refl && nc == 0) {
					// the deletion crosses the boundary: drop it and close/open
					// the two candidates with soft clips
					cn[j++] = (uint32_t)(lq - y)<<4 | 4;
					nc = j;
					cn[j++] = (uint32_t)y<<4 | 4;
					kk = p->k + (x + ln - refl);
					nlen[0] = x - coor;
					nlen[1] = p->len - nlen[0] - ln;
				} else cn[j++] = cigar[i];
				x += ln;
			} else if (op == 0) { // match
				if (x + ln >= refl && nc == 0) {
					// FIXME: not consider a special case where a split right between M and I
					cn[j++] = (uint32_t)(refl - x)<<4 | 0; // write M
					cn[j++] = (uint32_t)(lq - y - (refl - x))<<4 | 4; // write S
					nc = j; // cn[0..nc) is the first candidate
					mq[0] += refl - x;
					cn[j++] = (uint32_t)(y + (refl - x))<<4 | 4;
					if (x + ln - refl) cn[j++] = (uint32_t)(x + ln - refl)<<4 | 0;
					mq[1] += x + ln - refl;
					kk = bns->anns[seqid].offset + refl;
					nlen[0] = refl - coor;
					nlen[1] = p->len - nlen[0];
				} else {
					cn[j++] = cigar[i];
					mq[nc?1:0] += ln;
				}
				x += ln; y += ln;
			}
		}
		if (mq[0] > mq[1]) { // then take the first alignment
			n_cigar = nc;
			memcpy(cigar, cn, 4 * nc);
			p->len = nlen[0];
		} else {
			p->k = kk; p->len = nlen[1];
			n_cigar = j - nc;
			memcpy(cigar, cn + nc, 4 * (j - nc));
		}
		free(cn);
	}
	return n_cigar;
}

/* Allocate and fill b->aux for every hit of b.
 *
 * The hits array is shrunk first when less than half of it is in use.
 * gen_cigar() produces the CIGARs; then, for each unique hit (l == 0),
 * the CIGAR is fixed at reference boundaries, a mapping quality in
 * [0,250] is derived from G, G2, opt->t, the origin bits (flag>>16) and
 * n_seeds, and the reference sequence index and position are stored.
 * Hits with flag bit 1 set get quality 0. Repetitive hits get quality
 * 0, no CIGAR and chr = pos = -1. */
static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b, const char *name)
{
	int i;
	// allocate for b->aux
	if (b->n<<1 < b->max) {
		b->max = b->n;
		kroundup32(b->max);
		b->hits = realloc(b->hits, b->max * sizeof(bsw2hit_t));
	}
	b->aux = calloc(b->n, sizeof(bsw2aux_t));
	// generate CIGAR
	gen_cigar(opt, qlen, seq, bns->l_pac, pac, b, name);
	// fix CIGAR, generate mapQ, and write chromosomal position
	for (i = 0; i < b->n; ++i) {
		bsw2hit_t *p = &b->hits[i];
		bsw2aux_t *q = &b->aux[i];
		q->flag = p->flag & 0xfe; // keep bits 1..7 of the hit flag, drop bit 0
		q->isize = 0;
		if (p->l == 0) { // unique hit
			float c = 1.0;
			int subo;
			// fix out-of-boundary CIGAR
			q->n_cigar = fix_cigar(bns, p, q->n_cigar, q->cigar);
			// compute mapQ
			subo = p->G2 > opt->t? p->G2 : opt->t;
			if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5; // origin bits say: found in one pass only
			if (p->n_seeds < 2) c *= .2;
			q->qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499);
			if (q->qual > 250) q->qual = 250;
			if (q->qual < 0) q->qual = 0;
			if (p->flag&1) q->qual = 0; // this is a random hit
			q->pqual = q->qual; // set the paired qual as qual
			// get the chromosomal position
			q->nn = bns_cnt_ambi(bns, p->k, p->len, &q->chr);
			q->pos = p->k - bns->anns[q->chr].offset;
		} else q->qual = 0, q->n_cigar = 0, q->chr = q->pos = -1, q->nn = 0;
	}
}

/* Fill the mate-related fields of every aux entry of b from the mate's
 * hit set m; does nothing when m is null.
 *
 * Every entry is marked paired (flag 1). With no mate hit, flag 8 is
 * set; with exactly one mate hit, mchr/mpos, the mate-strand flag 0x20
 * and the insert size are set; otherwise mchr = mpos = -1. When both
 * ends have exactly one hit, the paired quality pqual of b's hit is
 * adjusted as described inline. */
static void update_mate_aux(bwtsw2_t *b, const bwtsw2_t *m)
{
	int i;
	if (m == 0) return;
	// update flag, mchr and mpos
	for (i = 0; i < b->n; ++i) {
		bsw2aux_t *q = &b->aux[i];
		q->flag |= 1; // paired
		if (m->n == 0) q->flag |= 8; // mate unmapped
		if (m->n == 1) {
			q->mchr = m->aux[0].chr;
			q->mpos = m->aux[0].pos;
			if (m->aux[0].flag&0x10) q->flag |= 0x20; // mate reverse strand
			if (q->chr == q->mchr) { // set insert size
				if (q->mpos + m->hits[0].len > q->pos)
					q->isize = q->mpos + m->hits[0].len - q->pos;
				// NOTE(review): this uses the length of hit 0 although q is
				// entry i -- confirm whether b->hits[i].len was intended
				else q->isize = q->mpos - q->pos - b->hits[0].len;
			} else q->isize = 0;
		} else q->mchr = q->mpos = -1;
	}
	// update mapping quality
	if (b->n == 1 && m->n == 1) {
		bsw2hit_t *p = &b->hits[0];
		if (p->flag & BSW2_FLAG_MATESW) { // this alignment is found by Smith-Waterman
			if (!(p->flag & BSW2_FLAG_TANDEM) && b->aux[0].pqual < 20)
				b->aux[0].pqual = 20;
			if (b->aux[0].pqual >= m->aux[0].qual) b->aux[0].pqual = m->aux[0].qual;
		} else if ((p->flag & 2) && !(m->hits[0].flag & BSW2_FLAG_MATESW)) { // properly paired
			if (!(p->flag & BSW2_FLAG_TANDEM)) { // pqual is bounded by [b->aux[0].qual,m->aux[0].qual]
				b->aux[0].pqual += 20;
				if (b->aux[0].pqual > m->aux[0].qual) b->aux[0].pqual = m->aux[0].qual;
				if (b->aux[0].pqual < b->aux[0].qual) b->aux[0].pqual = b->aux[0].qual;
			}
		}
	}
}

/* generate SAM lines for a sequence in ks with alignment stored in
 * b.
ks->name and ks->seq will be freed and set to NULL in the end. */ +static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks, bwtsw2_t *b, int is_pe, bwtsw2_t *bmate) +{ + int i, k; + kstring_t str; + memset(&str, 0, sizeof(kstring_t)); + if (b == 0 || b->n == 0) { // no hits + ksprintf(&str, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t", ks->name); + for (i = 0; i < ks->l; ++i) kputc(ks->seq[i], &str); + if (ks->qual) { + kputc('\t', &str); + for (i = 0; i < ks->l; ++i) kputc(ks->qual[i], &str); + } else kputs("\t*", &str); + kputc('\n', &str); + } + for (i = 0; b && i < b->n; ++i) { + bsw2hit_t *p = b->hits + i; + bsw2aux_t *q = b->aux + i; + int j, beg, end, type = 0; + // print mandatory fields before SEQ + if (q->cigar == 0) q->flag |= 0x4; + ksprintf(&str, "%s\t%d", ks->name, q->flag | (opt->multi_2nd && i? 0x100 : 0)); + ksprintf(&str, "\t%s\t%ld", q->chr>=0? bns->anns[q->chr].name : "*", (long)q->pos + 1); + if (p->l == 0 && q->cigar) { // not a repetitive hit + ksprintf(&str, "\t%d\t", q->pqual); + for (k = 0; k < q->n_cigar; ++k) + ksprintf(&str, "%d%c", q->cigar[k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[q->cigar[k]&0xf]); + } else ksprintf(&str, "\t0\t*"); + if (!is_pe) kputs("\t*\t0\t0\t", &str); + else ksprintf(&str, "\t%s\t%d\t%d\t", q->mchr==q->chr? "=" : (q->mchr<0? 
"*" : bns->anns[q->mchr].name), q->mpos+1, q->isize); + // get the sequence begin and end + beg = 0; end = ks->l; + if (opt->hard_clip && q->cigar) { + if ((q->cigar[0]&0xf) == 4) beg += q->cigar[0]>>4; + if ((q->cigar[q->n_cigar-1]&0xf) == 4) end -= q->cigar[q->n_cigar-1]>>4; + } + for (j = beg; j < end; ++j) { + if (p->flag&0x10) kputc(nt_comp_table[(int)ks->seq[ks->l - 1 - j]], &str); + else kputc(ks->seq[j], &str); + } + // print base quality if present + if (ks->qual) { + kputc('\t', &str); + for (j = beg; j < end; ++j) { + if (p->flag&0x10) kputc(ks->qual[ks->l - 1 - j], &str); + else kputc(ks->qual[j], &str); + } + } else kputs("\t*", &str); + // print optional tags + ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d\tNM:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds, q->nm); + if (q->nn) ksprintf(&str, "\tXN:i:%d", q->nn); + if (p->l) ksprintf(&str, "\tXI:i:%d", p->l - p->k + 1); + if (p->flag&BSW2_FLAG_MATESW) type |= 1; + if (p->flag&BSW2_FLAG_TANDEM) type |= 2; + if (type) ksprintf(&str, "\tXT:i:%d", type); + if (opt->cpy_cmt && ks->comment) { + int l = strlen(ks->comment); + if (l >= 6 && ks->comment[2] == ':' && ks->comment[4] == ':') { + kputc('\t', &str); kputs(ks->comment, &str); + } + } + kputc('\n', &str); + } + ks->sam = str.s; + free(ks->seq); ks->seq = 0; + free(ks->qual); ks->qual = 0; + free(ks->name); ks->name = 0; +} + +static void update_opt(bsw2opt_t *dst, const bsw2opt_t *src, int qlen) +{ + double ll = log(qlen); + int i, k; + *dst = *src; + if (dst->t < ll * dst->coef) dst->t = (int)(ll * dst->coef + .499); + // set band width: the query length sets a boundary on the maximum band width + k = (qlen * dst->a - 2 * dst->q) / (2 * dst->r + dst->a); + i = (qlen * dst->a - dst->a - dst->t) / dst->r; + if (k > i) k = i; + if (k < 1) k = 1; // I do not know if k==0 causes troubles + dst->bw = src->bw < k? src->bw : k; +} + +/* Core routine to align reads in _seq. 
It is separated from + * process_seqs() to realize multi-threading */ +static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe) +{ + int x; + bsw2opt_t opt; + bsw2global_t *pool = bsw2_global_init(); + bwtsw2_t **buf; + buf = calloc(_seq->n, sizeof(void*)); + for (x = 0; x < _seq->n; ++x) { + bsw2seq1_t *p = _seq->seq + x; + uint8_t *seq[2], *rseq[2]; + int i, l, k; + bwtsw2_t *b[2]; + l = p->l; + update_opt(&opt, _opt, p->l); + if (pool->max_l < l) { // then enlarge working space for aln_extend_core() + int tmp = ((l + 1) / 2 * opt.a + opt.r) / opt.r + l; + pool->max_l = l; + pool->aln_mem = realloc(pool->aln_mem, (tmp + 2) * 24); + } + // set seq[2] and rseq[2] + seq[0] = calloc(l * 4, 1); + seq[1] = seq[0] + l; + rseq[0] = seq[1] + l; rseq[1] = rseq[0] + l; + // convert sequences to 2-bit representation + for (i = k = 0; i < l; ++i) { + int c = nst_nt4_table[(int)p->seq[i]]; + if (c >= 4) { c = (int)(drand48() * 4); ++k; } // FIXME: ambiguous bases are not properly handled + seq[0][i] = c; + seq[1][l-1-i] = 3 - c; + rseq[0][l-1-i] = 3 - c; + rseq[1][i] = c; + } + if (l - k < opt.t) { // too few unambiguous bases + buf[x] = calloc(1, sizeof(bwtsw2_t)); + free(seq[0]); continue; + } + // alignment + b[0] = bsw2_aln1_core(&opt, bns, pac, target, l, seq, pool); + for (k = 0; k < b[0]->n; ++k) + if (b[0]->hits[k].n_seeds < opt.t_seeds) break; + if (k < b[0]->n) { + b[1] = bsw2_aln1_core(&opt, bns, pac, target, l, rseq, pool); + for (i = 0; i < b[1]->n; ++i) { + bsw2hit_t *p = &b[1]->hits[i]; + int x = p->beg; + p->flag ^= 0x10, p->is_rev ^= 1; // flip the strand + p->beg = l - p->end; + p->end = l - x; + } + flag_fr(b); + merge_hits(b, l, 0); + bsw2_resolve_duphits(0, 0, b[0], 0); + bsw2_resolve_query_overlaps(b[0], opt.mask_level); + } else b[1] = 0; + // generate CIGAR and print SAM + buf[x] = bsw2_dup_no_cigar(b[0]); + // free + free(seq[0]); + bsw2_destroy(b[0]); + } + if (is_pe) 
bsw2_pair(&opt, bns->l_pac, pac, _seq->n, _seq->seq, buf); + for (x = 0; x < _seq->n; ++x) { + bsw2seq1_t *p = _seq->seq + x; + uint8_t *seq[2]; + int i; + seq[0] = malloc(p->l * 2); seq[1] = seq[0] + p->l; + for (i = 0; i < p->l; ++i) { + int c = nst_nt4_table[(int)p->seq[i]]; + if (c >= 4) c = (int)(drand48() * 4); + seq[0][i] = c; + seq[1][p->l-1-i] = 3 - c; + } + update_opt(&opt, _opt, p->l); + write_aux(&opt, bns, p->l, seq, pac, buf[x], _seq->seq[x].name); + free(seq[0]); + } + for (x = 0; x < _seq->n; ++x) { + if (is_pe) update_mate_aux(buf[x], buf[x^1]); + print_hits(bns, &opt, &_seq->seq[x], buf[x], is_pe, buf[x^1]); + } + for (x = 0; x < _seq->n; ++x) bsw2_destroy(buf[x]); + free(buf); + bsw2_global_destroy(pool); +} + +#ifdef HAVE_PTHREAD +typedef struct { + int tid, is_pe; + bsw2seq_t *_seq; + const bsw2opt_t *_opt; + const bntseq_t *bns; + uint8_t *pac; + const bwt_t *target; +} thread_aux_t; + +/* another interface to bsw2_aln_core() to facilitate pthread_create() */ +static void *worker(void *data) +{ + thread_aux_t *p = (thread_aux_t*)data; + bsw2_aln_core(p->_seq, p->_opt, p->bns, p->pac, p->target, p->is_pe); + return 0; +} +#endif + +/* process sequences stored in _seq, generate SAM lines for these + * sequences and reset _seq afterwards. */ +static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe) +{ + int i; + is_pe = is_pe? 
1 : 0; + +#ifdef HAVE_PTHREAD + if (opt->n_threads <= 1) { + bsw2_aln_core(_seq, opt, bns, pac, target, is_pe); + } else { + pthread_t *tid; + pthread_attr_t attr; + thread_aux_t *data; + int j; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); + tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); + for (j = 0; j < opt->n_threads; ++j) { + thread_aux_t *p = data + j; + p->tid = j; p->_opt = opt; p->bns = bns; p->is_pe = is_pe; + p->pac = pac; p->target = target; + p->_seq = calloc(1, sizeof(bsw2seq_t)); + p->_seq->max = (_seq->n + opt->n_threads - 1) / opt->n_threads + 1; + p->_seq->n = 0; + p->_seq->seq = calloc(p->_seq->max, sizeof(bsw2seq1_t)); + } + for (i = 0; i < _seq->n; ++i) { // assign sequences to each thread + bsw2seq_t *p = data[(i>>is_pe)%opt->n_threads]._seq; + p->seq[p->n++] = _seq->seq[i]; + } + for (j = 0; j < opt->n_threads; ++j) pthread_create(&tid[j], &attr, worker, &data[j]); + for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0); + for (j = 0; j < opt->n_threads; ++j) data[j]._seq->n = 0; + for (i = 0; i < _seq->n; ++i) { // copy the result from each thread back + bsw2seq_t *p = data[(i>>is_pe)%opt->n_threads]._seq; + _seq->seq[i] = p->seq[p->n++]; + } + for (j = 0; j < opt->n_threads; ++j) { + thread_aux_t *p = data + j; + free(p->_seq->seq); + free(p->_seq); + } + free(data); free(tid); + } +#else + bsw2_aln_core(_seq, opt, bns, pac, target, is_pe); +#endif + + // print and reset + for (i = 0; i < _seq->n; ++i) { + bsw2seq1_t *p = _seq->seq + i; + if (p->sam) err_printf("%s", p->sam); + free(p->name); free(p->seq); free(p->qual); free(p->sam); + p->tid = -1; p->l = 0; + p->name = p->seq = p->qual = p->sam = 0; + } + err_fflush(stdout); + _seq->n = 0; +} + +void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2) +{ + gzFile fp, fp2; + kseq_t *ks, *ks2; + int l, 
is_pe = 0, i, n; + uint8_t *pac; + bsw2seq_t *_seq; + bseq1_t *bseq; + + pac = calloc(bns->l_pac/4+1, 1); + for (l = 0; l < bns->n_seqs; ++l) + err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len); + err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac); + fp = xzopen(fn, "r"); + ks = kseq_init(fp); + _seq = calloc(1, sizeof(bsw2seq_t)); + if (fn2) { + fp2 = xzopen(fn2, "r"); + ks2 = kseq_init(fp2); + is_pe = 1; + } else fp2 = 0, ks2 = 0, is_pe = 0; + while ((bseq = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { + int size = 0; + if (n > _seq->max) { + _seq->max = n; + kroundup32(_seq->max); + _seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t)); + } + _seq->n = n; + for (i = 0; i < n; ++i) { + bseq1_t *b = &bseq[i]; + bsw2seq1_t *p = &_seq->seq[i]; + p->tid = -1; p->l = b->l_seq; + p->name = b->name; p->seq = b->seq; p->qual = b->qual; p->comment = b->comment; p->sam = 0; + size += p->l; + } + fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp) ...\n", n, size); + free(bseq); + process_seqs(_seq, opt, bns, pac, target, is_pe); + } + // free + free(pac); + free(_seq->seq); free(_seq); + kseq_destroy(ks); + err_gzclose(fp); + if (fn2) { + kseq_destroy(ks2); + err_gzclose(fp2); + } +} diff --git a/src/bwa/bwtsw2_chain.c b/src/bwa/bwtsw2_chain.c new file mode 100644 index 000000000..ade77e781 --- /dev/null +++ b/src/bwa/bwtsw2_chain.c @@ -0,0 +1,112 @@ +#include +#include "bwtsw2.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +typedef struct { + uint32_t tbeg, tend; + int qbeg, qend; + uint32_t flag:1, idx:31; + int chain; // also reuse as a counter +} hsaip_t; + +#define _hsaip_lt(a, b) ((a).qbeg < (b).qbeg) + +#include "ksort.h" +KSORT_INIT(hsaip, hsaip_t, _hsaip_lt) + +static int chaining(const bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t *chain) +{ + int j, k, m = 0; + ks_introsort(hsaip, n, z); + for (j = 0; j < n; ++j) { + hsaip_t *p = z + j; + for (k = m - 1; k >= 0; --k) { + 
hsaip_t *q = chain + k; + int x = p->qbeg - q->qbeg; // always positive + int y = p->tbeg - q->tbeg; + if (y > 0 && x < opt->max_chain_gap && y < opt->max_chain_gap && x - y <= opt->bw && y - x <= opt->bw) { // chained + if (p->qend > q->qend) q->qend = p->qend; + if (p->tend > q->tend) q->tend = p->tend; + ++q->chain; + p->chain = shift + k; + break; + } else if (q->chain > opt->t_seeds * 2) k = 0; // if the chain is strong enough, do not check the previous chains + } + if (k < 0) { // not added to any previous chains + chain[m] = *p; + chain[m].chain = 1; + chain[m].idx = p->chain = shift + m; + ++m; + } + } + return m; +} + +void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]) +{ + hsaip_t *z[2], *chain[2]; + int i, j, k, n[2], m[2], thres = opt->t_seeds * 2; + char *flag; + // initialization + n[0] = b[0]->n; n[1] = b[1]->n; + z[0] = calloc(n[0] + n[1], sizeof(hsaip_t)); + z[1] = z[0] + n[0]; + chain[0] = calloc(n[0] + n[1], sizeof(hsaip_t)); + for (k = j = 0; k < 2; ++k) { + for (i = 0; i < b[k]->n; ++i) { + bsw2hit_t *p = b[k]->hits + i; + hsaip_t *q = z[k] + i; + q->flag = k; q->idx = i; + q->tbeg = p->k; q->tend = p->k + p->len; + q->chain = -1; + q->qbeg = p->beg; q->qend = p->end; + } + } + // chaining + m[0] = chaining(opt, 0, n[0], z[0], chain[0]); + chain[1] = chain[0] + m[0]; + m[1] = chaining(opt, m[0], n[1], z[1], chain[1]); + // change query coordinate on the reverse strand + for (k = 0; k < m[1]; ++k) { + hsaip_t *p = chain[1] + k; + int tmp = p->qbeg; + p->qbeg = len - p->qend; p->qend = len - tmp; + } + //for (k = 0; k < m[0]; ++k) printf("%d, [%d,%d), [%d,%d)\n", chain[0][k].chain, chain[0][k].tbeg, chain[0][k].tend, chain[0][k].qbeg, chain[0][k].qend); + // filtering + flag = calloc(m[0] + m[1], 1); + ks_introsort(hsaip, m[0] + m[1], chain[0]); + for (k = 1; k < m[0] + m[1]; ++k) { + hsaip_t *p = chain[0] + k; + for (j = 0; j < k; ++j) { + hsaip_t *q = chain[0] + j; + if (flag[q->idx]) continue; + if (q->qend >= p->qend && 
q->chain > p->chain * thres && p->chain < thres) { + flag[p->idx] = 1; + break; + } + } + } + for (k = 0; k < n[0] + n[1]; ++k) { + hsaip_t *p = z[0] + k; + if (flag[p->chain]) + b[p->flag]->hits[p->idx].G = 0; + } + free(flag); + // squeeze out filtered elements in b[2] + for (k = 0; k < 2; ++k) { + for (j = i = 0; j < n[k]; ++j) { + bsw2hit_t *p = b[k]->hits + j; + if (p->G) { + if (i != j) b[k]->hits[i++] = *p; + else ++i; + } + } + b[k]->n = i; + } + // free + free(z[0]); free(chain[0]); +} diff --git a/src/bwa/bwtsw2_core.c b/src/bwa/bwtsw2_core.c new file mode 100644 index 000000000..11196015e --- /dev/null +++ b/src/bwa/bwtsw2_core.c @@ -0,0 +1,619 @@ +#include +#include +#include +#include +#include +#include "bwt_lite.h" +#include "bwtsw2.h" +#include "bwt.h" +#include "kvec.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +typedef struct { + bwtint_t k, l; +} qintv_t; + +#define qintv_eq(a, b) ((a).k == (b).k && (a).l == (b).l) +#define qintv_hash(a) ((a).k>>7^(a).l<<17) + +#include "khash.h" +KHASH_INIT(qintv, qintv_t, uint64_t, 1, qintv_hash, qintv_eq) +KHASH_MAP_INIT_INT64(64, uint64_t) + +#define MINUS_INF -0x3fffffff +#define MASK_LEVEL 0.90f + +struct __mempool_t; +static void mp_destroy(struct __mempool_t*); +typedef struct { + bwtint_t qk, ql; + int I, D, G; + uint32_t pj:2, qlen:30; + int tlen; + int ppos, upos; + int cpos[4]; +} bsw2cell_t; + +#include "ksort.h" +KSORT_INIT_GENERIC(int) +#define __hitG_lt(a, b) (((a).G + ((int)(a).n_seeds<<2)) > (b).G + ((int)(b).n_seeds<<2)) +KSORT_INIT(hitG, bsw2hit_t, __hitG_lt) + +static const bsw2cell_t g_default_cell = { 0, 0, MINUS_INF, MINUS_INF, MINUS_INF, 0, 0, 0, -1, -1, {-1, -1, -1, -1} }; + +typedef struct { + int n, max; + uint32_t tk, tl; // this is fine + bsw2cell_t *array; +} bsw2entry_t, *bsw2entry_p; + +/* --- BEGIN: Stack operations --- */ +typedef struct { + int n_pending; + kvec_t(bsw2entry_p) stack0, pending; + struct __mempool_t *pool; +} bsw2stack_t; + +#define 
stack_isempty(s) (kv_size(s->stack0) == 0 && s->n_pending == 0) +static void stack_destroy(bsw2stack_t *s) { mp_destroy(s->pool); kv_destroy(s->stack0); kv_destroy(s->pending); free(s); } +inline static void stack_push0(bsw2stack_t *s, bsw2entry_p e) { kv_push(bsw2entry_p, s->stack0, e); } +inline static bsw2entry_p stack_pop(bsw2stack_t *s) +{ + assert(!(kv_size(s->stack0) == 0 && s->n_pending != 0)); + return kv_pop(s->stack0); +} +/* --- END: Stack operations --- */ + +/* --- BEGIN: memory pool --- */ +typedef struct __mempool_t { + int cnt; // if cnt!=0, then there must be memory leak + kvec_t(bsw2entry_p) pool; +} mempool_t; +inline static bsw2entry_p mp_alloc(mempool_t *mp) +{ + ++mp->cnt; + if (kv_size(mp->pool) == 0) return (bsw2entry_t*)calloc(1, sizeof(bsw2entry_t)); + else return kv_pop(mp->pool); +} +inline static void mp_free(mempool_t *mp, bsw2entry_p e) +{ + --mp->cnt; e->n = 0; + kv_push(bsw2entry_p, mp->pool, e); +} +static void mp_destroy(struct __mempool_t *mp) +{ + int i; + for (i = 0; i != kv_size(mp->pool); ++i) { + free(kv_A(mp->pool, i)->array); + free(kv_A(mp->pool, i)); + } + kv_destroy(mp->pool); + free(mp); +} +/* --- END: memory pool --- */ + +/* --- BEGIN: utilities --- */ +static khash_t(64) *bsw2_connectivity(const bwtl_t *b) +{ + khash_t(64) *h; + uint32_t k, l, cntk[4], cntl[4]; // this is fine + uint64_t x; + khiter_t iter; + int j, ret; + kvec_t(uint64_t) stack; + + kv_init(stack); + h = kh_init(64); + kh_resize(64, h, b->seq_len * 4); + x = b->seq_len; + kv_push(uint64_t, stack, x); + while (kv_size(stack)) { + x = kv_pop(stack); + k = x>>32; l = (uint32_t)x; + bwtl_2occ4(b, k-1, l, cntk, cntl); + for (j = 0; j != 4; ++j) { + k = b->L2[j] + cntk[j] + 1; + l = b->L2[j] + cntl[j]; + if (k > l) continue; + x = (uint64_t)k << 32 | l; + iter = kh_put(64, h, x, &ret); + if (ret) { // if not present + kh_value(h, iter) = 1; + kv_push(uint64_t, stack, x); + } else ++kh_value(h, iter); + } + } + kv_destroy(stack); + //fprintf(stderr, 
"[bsw2_connectivity] %u nodes in the DAG\n", kh_size(h)); + return h; +} +// pick up top T matches at a node +static void cut_tail(bsw2entry_t *u, int T, bsw2entry_t *aux) +{ + int i, *a, n, x; + if (u->n <= T) return; + if (aux->max < u->n) { + aux->max = u->n; + aux->array = (bsw2cell_t*)realloc(aux->array, aux->max * sizeof(bsw2cell_t)); + } + a = (int*)aux->array; + for (i = n = 0; i != u->n; ++i) + if (u->array[i].ql && u->array[i].G > 0) + a[n++] = -u->array[i].G; + if (n <= T) return; + x = -ks_ksmall(int, n, a, T); + n = 0; + for (i = 0; i < u->n; ++i) { + bsw2cell_t *p = u->array + i; + if (p->G == x) ++n; + if (p->G < x || (p->G == x && n >= T)) { + p->qk = p->ql = 0; p->G = 0; + if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -1; + } + } +} +// remove duplicated cells +static inline void remove_duplicate(bsw2entry_t *u, khash_t(qintv) *hash) +{ + int i, ret, j; + khiter_t k; + qintv_t key; + kh_clear(qintv, hash); + for (i = 0; i != u->n; ++i) { + bsw2cell_t *p = u->array + i; + if (p->ql == 0) continue; + key.k = p->qk; key.l = p->ql; + k = kh_put(qintv, hash, key, &ret); + j = -1; + if (ret == 0) { + if ((uint32_t)kh_value(hash, k) >= p->G) j = i; + else { + j = kh_value(hash, k)>>32; + kh_value(hash, k) = (uint64_t)i<<32 | p->G; + } + } else kh_value(hash, k) = (uint64_t)i<<32 | p->G; + if (j >= 0) { + p = u->array + j; + p->qk = p->ql = 0; p->G = 0; + if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -3; + } + } +} +// merge two entries +static void merge_entry(const bsw2opt_t * __restrict opt, bsw2entry_t *u, bsw2entry_t *v, bwtsw2_t *b) +{ + int i; + if (u->n + v->n >= u->max) { + u->max = u->n + v->n; + u->array = (bsw2cell_t*)realloc(u->array, u->max * sizeof(bsw2cell_t)); + } + for (i = 0; i != v->n; ++i) { + bsw2cell_t *p = v->array + i; + if (p->ppos >= 0) p->ppos += u->n; + if (p->cpos[0] >= 0) p->cpos[0] += u->n; + if (p->cpos[1] >= 0) p->cpos[1] += u->n; + if (p->cpos[2] >= 0) p->cpos[2] += u->n; + if (p->cpos[3] >= 0) p->cpos[3] += u->n; 
+ } + memcpy(u->array + u->n, v->array, v->n * sizeof(bsw2cell_t)); + u->n += v->n; +} + +static inline bsw2cell_t *push_array_p(bsw2entry_t *e) +{ + if (e->n == e->max) { + e->max = e->max? e->max<<1 : 256; + e->array = (bsw2cell_t*)realloc(e->array, sizeof(bsw2cell_t) * e->max); + } + return e->array + e->n; +} + +static inline double time_elapse(const struct rusage *curr, const struct rusage *last) +{ + long t1 = (curr->ru_utime.tv_sec - last->ru_utime.tv_sec) + (curr->ru_stime.tv_sec - last->ru_stime.tv_sec); + long t2 = (curr->ru_utime.tv_usec - last->ru_utime.tv_usec) + (curr->ru_stime.tv_usec - last->ru_stime.tv_usec); + return (double)t1 + t2 * 1e-6; +} +/* --- END: utilities --- */ + +/* --- BEGIN: processing partial hits --- */ +static void save_hits(const bwtl_t *bwt, int thres, bsw2hit_t *hits, bsw2entry_t *u) +{ + int i; + uint32_t k; // this is fine + for (i = 0; i < u->n; ++i) { + bsw2cell_t *p = u->array + i; + if (p->G < thres) continue; + for (k = u->tk; k <= u->tl; ++k) { + int beg, end; + bsw2hit_t *q = 0; + beg = bwt->sa[k]; end = beg + p->tlen; + if (p->G > hits[beg*2].G) { + hits[beg*2+1] = hits[beg*2]; + q = hits + beg * 2; + } else if (p->G > hits[beg*2+1].G) q = hits + beg * 2 + 1; + if (q) { + q->k = p->qk; q->l = p->ql; q->len = p->qlen; q->G = p->G; + q->beg = beg; q->end = end; q->G2 = q->k == q->l? 0 : q->G; + q->flag = q->n_seeds = 0; + } + } + } +} +/* "narrow hits" are node-to-node hits that have a high score and + * are not so repetitive (|SA interval|<=IS). */ +static void save_narrow_hits(const bwtl_t *bwtl, bsw2entry_t *u, bwtsw2_t *b1, int t, int IS) +{ + int i; + for (i = 0; i < u->n; ++i) { + bsw2hit_t *q; + bsw2cell_t *p = u->array + i; + if (p->G >= t && p->ql - p->qk + 1 <= IS) { // good narrow hit + if (b1->max == b1->n) { + b1->max = b1->max? 
b1->max<<1 : 4; + b1->hits = realloc(b1->hits, b1->max * sizeof(bsw2hit_t)); + } + q = &b1->hits[b1->n++]; + q->k = p->qk; q->l = p->ql; + q->len = p->qlen; + q->G = p->G; q->G2 = 0; + q->beg = bwtl->sa[u->tk]; q->end = q->beg + p->tlen; + q->flag = 0; + // delete p + p->qk = p->ql = 0; p->G = 0; + if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -3; + } + } +} +/* after this, "narrow SA hits" will be expanded and the coordinates + * will be obtained and stored in b->hits[*].k. */ +int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS) +{ + int i, j, n, is_rev; + if (b->n == 0) return 0; + if (bwt && bns) { // convert to chromosomal coordinates if requested + int old_n = b->n; + bsw2hit_t *old_hits = b->hits; + for (i = n = 0; i < b->n; ++i) { // compute the memory to allocated + bsw2hit_t *p = old_hits + i; + if (p->l - p->k + 1 <= IS) n += p->l - p->k + 1; + else if (p->G > 0) ++n; + } + b->n = b->max = n; + b->hits = calloc(b->max, sizeof(bsw2hit_t)); + for (i = j = 0; i < old_n; ++i) { + bsw2hit_t *p = old_hits + i; + if (p->l - p->k + 1 <= IS) { // the hit is no so repetitive + bwtint_t k; + if (p->G == 0 && p->k == 0 && p->l == 0 && p->len == 0) continue; + for (k = p->k; k <= p->l; ++k) { + b->hits[j] = *p; + b->hits[j].k = bns_depos(bns, bwt_sa(bwt, k), &is_rev); + b->hits[j].l = 0; + b->hits[j].is_rev = is_rev; + if (is_rev) b->hits[j].k -= p->len - 1; + ++j; + } + } else if (p->G > 0) { + b->hits[j] = *p; + b->hits[j].k = bns_depos(bns, bwt_sa(bwt, p->k), &is_rev); + b->hits[j].l = 0; + b->hits[j].flag |= 1; + b->hits[j].is_rev = is_rev; + if (is_rev) b->hits[j].k -= p->len - 1; + ++j; + } + } + free(old_hits); + } + for (i = j = 0; i < b->n; ++i) // squeeze out empty elements + if (b->hits[i].G) b->hits[j++] = b->hits[i]; + b->n = j; + ks_introsort(hitG, b->n, b->hits); + for (i = 1; i < b->n; ++i) { + bsw2hit_t *p = b->hits + i; + for (j = 0; j < i; ++j) { + bsw2hit_t *q = b->hits + j; + int compatible = 1; + if (p->is_rev 
!= q->is_rev) continue; // hits from opposite strands are not duplicates + if (p->l == 0 && q->l == 0) { + int qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg); // length of query overlap + if (qol < 0) qol = 0; + if ((float)qol / (p->end - p->beg) > MASK_LEVEL || (float)qol / (q->end - q->beg) > MASK_LEVEL) { + int64_t tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len) + - (int64_t)(p->k > q->k? p->k : q->k); // length of target overlap + if ((double)tol / p->len > MASK_LEVEL || (double)tol / q->len > MASK_LEVEL) + compatible = 0; + } + } + if (!compatible) { + p->G = 0; + if (q->G2 < p->G2) q->G2 = p->G2; + break; + } + } + } + n = i; + for (i = j = 0; i < n; ++i) { + if (b->hits[i].G == 0) continue; + if (i != j) b->hits[j++] = b->hits[i]; + else ++j; + } + b->n = j; + return b->n; +} + +int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level) +{ + int i, j, n; + if (b->n == 0) return 0; + ks_introsort(hitG, b->n, b->hits); + { // choose a random one + int G0 = b->hits[0].G; + for (i = 1; i < b->n; ++i) + if (b->hits[i].G != G0) break; + j = (int)(i * drand48()); + if (j) { + bsw2hit_t tmp; + tmp = b->hits[0]; b->hits[0] = b->hits[j]; b->hits[j] = tmp; + } + } + for (i = 1; i < b->n; ++i) { + bsw2hit_t *p = b->hits + i; + int all_compatible = 1; + if (p->G == 0) break; + for (j = 0; j < i; ++j) { + bsw2hit_t *q = b->hits + j; + int64_t tol = 0; + int qol, compatible = 0; + float fol; + if (q->G == 0) continue; + qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg); + if (qol < 0) qol = 0; + if (p->l == 0 && q->l == 0) { + tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len) + - (p->k > q->k? p->k : q->k); + if (tol < 0) tol = 0; + } + fol = (float)qol / (p->end - p->beg < q->end - q->beg? 
p->end - p->beg : q->end - q->beg); + if (fol < mask_level || (tol > 0 && qol < p->end - p->beg && qol < q->end - q->beg)) compatible = 1; + if (!compatible) { + if (q->G2 < p->G) q->G2 = p->G; + all_compatible = 0; + } + } + if (!all_compatible) p->G = 0; + } + n = i; + for (i = j = 0; i < n; ++i) { + if (b->hits[i].G == 0) continue; + if (i != j) b->hits[j++] = b->hits[i]; + else ++j; + } + b->n = j; + return j; +} +/* --- END: processing partial hits --- */ + +/* --- BEGIN: global mem pool --- */ +bsw2global_t *bsw2_global_init() +{ + bsw2global_t *pool; + bsw2stack_t *stack; + pool = calloc(1, sizeof(bsw2global_t)); + stack = calloc(1, sizeof(bsw2stack_t)); + stack->pool = (mempool_t*)calloc(1, sizeof(mempool_t)); + pool->stack = (void*)stack; + return pool; +} + +void bsw2_global_destroy(bsw2global_t *pool) +{ + stack_destroy((bsw2stack_t*)pool->stack); + free(pool->aln_mem); + free(pool); +} +/* --- END: global mem pool --- */ + +static inline int fill_cell(const bsw2opt_t *o, int match_score, bsw2cell_t *c[4]) +{ + int G = c[3]? c[3]->G + match_score : MINUS_INF; + if (c[1]) { + c[0]->I = c[1]->I > c[1]->G - o->q? c[1]->I - o->r : c[1]->G - o->qr; + if (c[0]->I > G) G = c[0]->I; + } else c[0]->I = MINUS_INF; + if (c[2]) { + c[0]->D = c[2]->D > c[2]->G - o->q? c[2]->D - o->r : c[2]->G - o->qr; + if (c[0]->D > G) G = c[0]->D; + } else c[0]->D = MINUS_INF; + return(c[0]->G = G); +} + +static void init_bwtsw2(const bwtl_t *target, const bwt_t *query, bsw2stack_t *s) +{ + bsw2entry_t *u; + bsw2cell_t *x; + + u = mp_alloc(s->pool); + u->tk = 0; u->tl = target->seq_len; + x = push_array_p(u); + *x = g_default_cell; + x->G = 0; x->qk = 0; x->ql = query->seq_len; + u->n++; + stack_push0(s, u); +} +/* On return, ret[1] keeps not-so-repetitive hits (narrow SA hits); ret[0] keeps all hits (right?) 
/* On return, ret[1] keeps not-so-repetitive hits (narrow SA hits); ret[0] keeps all hits (right?) */
/*
 * Core traversal: repeatedly pops an entry from pool->stack, descends to its
 * (up to four) children on the target index and, for each child, fills
 * cells against the query index, until the stack is empty.
 *
 * Returns a calloc'ed array of two hit sets: [0] is `b` (filled through
 * save_hits()/merge_entry()) and [1] is `b1` (filled through
 * save_narrow_hits()).  Both are passed through bsw2_resolve_duphits()
 * before returning.  The caller owns the array and both hit sets.
 *
 * NOTE(review): allocation results are used unchecked here; presumably the
 * build relies on aborting malloc wrappers -- confirm.
 */
bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool)
{
	bsw2stack_t *stack = (bsw2stack_t*)pool->stack;
	bwtsw2_t *b, *b1, **b_ret;
	int i, j, score_mat[16], *heap, heap_size, n_tot = 0;
	struct rusage curr, last;
	khash_t(qintv) *rhash;
	khash_t(64) *chash;

	// initialize connectivity hash (chash)
	chash = bsw2_connectivity(target);
	// calculate score matrix
	for (i = 0; i != 4; ++i)
		for (j = 0; j != 4; ++j)
			score_mat[i<<2|j] = (i == j)? opt->a : -opt->b;
	// initialize other variables
	rhash = kh_init(qintv);
	init_bwtsw2(target, query, stack);
	heap_size = opt->z;
	heap = calloc(heap_size, sizeof(int));
	// initialize the return struct
	b = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t));
	b->n = b->max = target->seq_len * 2;
	b->hits = calloc(b->max, sizeof(bsw2hit_t));
	b1 = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t));
	b_ret = calloc(2, sizeof(void*));
	b_ret[0] = b; b_ret[1] = b1;
	// initialize timer
	getrusage(0, &last);
	// the main loop: traversal of the DAG
	while (!stack_isempty(stack)) {
		int old_n, tj;
		bsw2entry_t *v;
		uint32_t tcntk[4], tcntl[4];
		bwtint_t k, l;

		v = stack_pop(stack); old_n = v->n; // old_n: cells present before this round appends to v
		n_tot += v->n;

		for (i = 0; i < v->n; ++i) { // test max depth and band width
			bsw2cell_t *p = v->array + i;
			if (p->ql == 0) continue;
			if (p->tlen - (int)p->qlen > opt->bw || (int)p->qlen - p->tlen > opt->bw) {
				p->qk = p->ql = 0; // ql == 0 marks the cell as deleted
				if (p->ppos >= 0) v->array[p->ppos].cpos[p->pj] = -5;
			}
		}

		// get Occ for the DAG
		bwtl_2occ4(target, v->tk - 1, v->tl, tcntk, tcntl);
		for (tj = 0; tj != 4; ++tj) { // descend to the children
			bwtint_t qcntk[4], qcntl[4];
			int qj, *curr_score_mat = score_mat + tj * 4;
			khiter_t iter;
			bsw2entry_t *u;

			k = target->L2[tj] + tcntk[tj] + 1;
			l = target->L2[tj] + tcntl[tj];
			if (k > l) continue;
			// update counter
			// the chash value packs a pending-array position (high 32 bits) and a counter (low 32 bits)
			iter = kh_get(64, chash, (uint64_t)k<<32 | l);
			--kh_value(chash, iter);
			// initialization
			u = mp_alloc(stack->pool);
			u->tk = k; u->tl = l;
			memset(heap, 0, sizeof(int) * opt->z);
			// loop through all the nodes in v
			for (i = 0; i < v->n; ++i) {
				bsw2cell_t *p = v->array + i, *x, *c[4]; // c[0]=>current, c[1]=>I, c[2]=>D, c[3]=>G
				int is_added = 0;
				if (p->ql == 0) continue; // deleted node
				c[0] = x = push_array_p(u);
				x->G = MINUS_INF;
				p->upos = x->upos = -1;
				if (p->ppos >= 0) { // parent has been visited
					c[1] = (v->array[p->ppos].upos >= 0)? u->array + v->array[p->ppos].upos : 0;
					c[3] = v->array + p->ppos; c[2] = p;
					if (fill_cell(opt, curr_score_mat[p->pj], c) > 0) { // then update topology at p and x
						x->ppos = v->array[p->ppos].upos; // the parent pos in u
						p->upos = u->n++; // the current pos in u
						if (x->ppos >= 0) u->array[x->ppos].cpos[p->pj] = p->upos; // the child pos of its parent in u
						is_added = 1;
					}
				} else {
					x->D = p->D > p->G - opt->q? p->D - opt->r : p->G - opt->qr;
					if (x->D > 0) {
						x->G = x->D;
						x->I = MINUS_INF; x->ppos = -1;
						p->upos = u->n++;
						is_added = 1;
					}
				}
				if (is_added) { // x has been added to u->array. fill the remaining variables
					x->cpos[0] = x->cpos[1] = x->cpos[2] = x->cpos[3] = -1;
					x->pj = p->pj; x->qk = p->qk; x->ql = p->ql; x->qlen = p->qlen; x->tlen = p->tlen + 1;
					// heap stores negated scores; heap[0] is replaced when x scores higher than -heap[0]
					if (x->G > -heap[0]) {
						heap[0] = -x->G;
						ks_heapadjust(int, 0, heap_size, heap);
					}
				}
				if ((x->G > opt->qr && x->G >= -heap[0]) || i < old_n) { // good node in u, or in v
					if (p->cpos[0] == -1 || p->cpos[1] == -1 || p->cpos[2] == -1 || p->cpos[3] == -1) {
						bwt_2occ4(query, p->qk - 1, p->ql, qcntk, qcntl);
						for (qj = 0; qj != 4; ++qj) { // descend to the prefix trie
							if (p->cpos[qj] != -1) continue; // this node will be visited later
							k = query->L2[qj] + qcntk[qj] + 1;
							l = query->L2[qj] + qcntl[qj];
							if (k > l) { p->cpos[qj] = -2; continue; }
							x = push_array_p(v);
							p = v->array + i; // p may not point to the correct position after realloc
							x->G = x->I = x->D = MINUS_INF;
							x->qk = k; x->ql = l; x->pj = qj; x->qlen = p->qlen + 1; x->ppos = i; x->tlen = p->tlen;
							x->cpos[0] = x->cpos[1] = x->cpos[2] = x->cpos[3] = -1;
							p->cpos[qj] = v->n++;
						} // ~for(qj)
					} // ~if(p->cpos[])
				} // ~if
			} // ~for(i)
			if (u->n) save_hits(target, opt->t, b->hits, u);
			{ // push u to the stack (or to the pending array)
				uint32_t cnt, pos;
				cnt = (uint32_t)kh_value(chash, iter);
				pos = kh_value(chash, iter)>>32;
				if (pos) { // something in the pending array, then merge
					bsw2entry_t *w = kv_A(stack->pending, pos-1);
					if (u->n) {
						if (w->n < u->n) { // swap
							w = u; u = kv_A(stack->pending, pos-1); kv_A(stack->pending, pos-1) = w;
						}
						merge_entry(opt, w, u, b);
					}
					if (cnt == 0) { // move from pending to stack0
						remove_duplicate(w, rhash);
						save_narrow_hits(target, w, b1, opt->t, opt->is);
						cut_tail(w, opt->z, u);
						stack_push0(stack, w);
						kv_A(stack->pending, pos-1) = 0;
						--stack->n_pending;
					}
					mp_free(stack->pool, u);
				} else if (cnt) { // the first time
					if (u->n) { // push to the pending queue
						++stack->n_pending;
						kv_push(bsw2entry_p, stack->pending, u);
						kh_value(chash, iter) = (uint64_t)kv_size(stack->pending)<<32 | cnt;
					} else mp_free(stack->pool, u);
				} else { // cnt == 0, then push to the stack
					bsw2entry_t *w = mp_alloc(stack->pool); // scratch entry handed to cut_tail, freed right after
					save_narrow_hits(target, u, b1, opt->t, opt->is);
					cut_tail(u, opt->z, w);
					mp_free(stack->pool, w);
					stack_push0(stack, u);
				}
			}
		} // ~for(tj)
		mp_free(stack->pool, v);
	} // while(top)
	getrusage(0, &curr);
	for (i = 0; i < 2; ++i)
		for (j = 0; j < b_ret[i]->n; ++j)
			b_ret[i]->hits[j].n_seeds = 0;
	bsw2_resolve_duphits(bns, query, b, opt->is);
	bsw2_resolve_duphits(bns, query, b1, opt->is);
	//fprintf(stderr, "stats: %.3lf sec; %d elems\n", time_elapse(&curr, &last), n_tot);
	// free
	free(heap);
	kh_destroy(qintv, rhash);
	kh_destroy(64, chash);
	stack->pending.n = stack->stack0.n = 0; // empty both arrays so the pool's stack can be reused
	return b_ret;
}
stdout); break; + case 'I': opt->max_ins = atoi(optarg); break; + case 'S': opt->skip_sw = 1; break; + case 'C': opt->cpy_cmt = 1; break; + case 'G': opt->max_chain_gap = atoi(optarg); break; + default: return 1; + } + } + opt->qr = opt->q + opt->r; + + if (optind + 2 > argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa bwasw [options] [query2.fa]\n\n"); + fprintf(stderr, "Options: -a INT score for a match [%d]\n", opt->a); + fprintf(stderr, " -b INT mismatch penalty [%d]\n", opt->b); + fprintf(stderr, " -q INT gap open penalty [%d]\n", opt->q); + fprintf(stderr, " -r INT gap extension penalty [%d]\n", opt->r); + fprintf(stderr, " -w INT band width [%d]\n", opt->bw); + fprintf(stderr, " -m FLOAT mask level [%.2f]\n", opt->mask_level); + fprintf(stderr, "\n"); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -f FILE file to output results to instead of stdout\n"); + fprintf(stderr, " -H in SAM output, use hard clipping instead of soft clipping\n"); + fprintf(stderr, " -C copy FASTA/Q comment to SAM output\n"); + fprintf(stderr, " -M mark multi-part alignments as secondary\n"); + fprintf(stderr, " -S skip Smith-Waterman read pairing\n"); + fprintf(stderr, " -I INT ignore pairs with insert >=INT for inferring the size distr [%d]\n", opt->max_ins); + fprintf(stderr, "\n"); + fprintf(stderr, " -T INT score threshold divided by a [%d]\n", opt->t); + fprintf(stderr, " -c FLOAT coefficient of length-threshold adjustment [%.1f]\n", opt->coef); + fprintf(stderr, " -z INT Z-best [%d]\n", opt->z); + fprintf(stderr, " -s INT maximum seeding interval size [%d]\n", opt->is); + fprintf(stderr, " -N INT # seeds to trigger rev aln; 2*INT is also the chaining threshold [%d]\n", opt->t_seeds); + fprintf(stderr, " -G INT maximum gap size during chaining [%d]\n", opt->max_chain_gap); + fprintf(stderr, "\n"); + fprintf(stderr, "Note: For long Illumina, 454 and Sanger reads, assembly contigs, fosmids and\n"); + fprintf(stderr, " BACs, 
the default setting usually works well. For the current PacBio\n"); + fprintf(stderr, " reads (end of 2010), '-b5 -q2 -r1 -z10' is recommended. One may also\n"); + fprintf(stderr, " increase '-z' for better sensitivity.\n"); + fprintf(stderr, "\n"); + + return 1; + } + + // adjust opt for opt->a + opt->t *= opt->a; + opt->coef *= opt->a; + + if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1; + bsw2_aln(opt, idx->bns, idx->bwt, argv[optind+1], optind+2 < argc? argv[optind+2] : 0); + bwa_idx_destroy(idx); + free(opt); + + return 0; +} diff --git a/src/bwa/bwtsw2_pair.c b/src/bwa/bwtsw2_pair.c new file mode 100644 index 000000000..24905dff1 --- /dev/null +++ b/src/bwa/bwtsw2_pair.c @@ -0,0 +1,268 @@ +#include +#include +#include +#include +#include "utils.h" +#include "bwt.h" +#include "bntseq.h" +#include "bwtsw2.h" +#include "kstring.h" +#include "ksw.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +#define MIN_RATIO 0.8 +#define OUTLIER_BOUND 2.0 +#define MAX_STDDEV 4.0 +#define EXT_STDDEV 4.0 + +typedef struct { + int low, high, failed; + double avg, std; +} bsw2pestat_t; + +bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) +{ + int i, k, x, p25, p50, p75, tmp, max_len = 0; + uint64_t *isize; + bsw2pestat_t r; + + memset(&r, 0, sizeof(bsw2pestat_t)); + isize = calloc(n, 8); + for (i = k = 0; i < n; i += 2) { + bsw2hit_t *t[2]; + int l; + if (buf[i] == 0 || buf[i]->n != 1 || buf[i+1]->n != 1) continue; // more than 1 hits + t[0] = &buf[i]->hits[0]; t[1] = &buf[i+1]->hits[0]; + if (t[0]->G2 > 0.8 * t[0]->G) continue; // the best hit is not good enough + if (t[1]->G2 > 0.8 * t[1]->G) continue; // the best hit is not good enough + l = t[0]->k > t[1]->k? t[0]->k - t[1]->k + t[1]->len : t[1]->k - t[0]->k + t[0]->len; + if (l >= max_ins) continue; // skip pairs with excessively large insert + max_len = max_len > t[0]->end - t[0]->beg? 
max_len : t[0]->end - t[0]->beg; + max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg; + isize[k++] = l; + } + ks_introsort_64(k, isize); + p25 = isize[(int)(.25 * k + .499)]; + p50 = isize[(int)(.50 * k + .499)]; + p75 = isize[(int)(.75 * k + .499)]; + ksprintf(msg, "[%s] infer the insert size distribution from %d high-quality pairs.\n", __func__, k); + if (k < 8) { + ksprintf(msg, "[%s] fail to infer the insert size distribution: too few good pairs.\n", __func__); + free(isize); + r.failed = 1; + return r; + } + tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); + r.low = tmp > max_len? tmp : max_len; + if (r.low < 1) r.low = 1; + r.high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); + if (r.low > r.high) { + ksprintf(msg, "[%s] fail to infer the insert size distribution: upper bound is smaller than max read length.\n", __func__); + free(isize); + r.failed = 1; + return r; + } + ksprintf(msg, "[%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75); + ksprintf(msg, "[%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r.low, r.high); + for (i = x = 0, r.avg = 0; i < k; ++i) + if (isize[i] >= r.low && isize[i] <= r.high) + r.avg += isize[i], ++x; + r.avg /= x; + for (i = 0, r.std = 0; i < k; ++i) + if (isize[i] >= r.low && isize[i] <= r.high) + r.std += (isize[i] - r.avg) * (isize[i] - r.avg); + r.std = sqrt(r.std / x); + ksprintf(msg, "[%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r.avg, r.std); + tmp = (int)(p25 - 3. * (p75 - p25) + .499); + r.low = tmp > max_len? tmp : max_len; + if (r.low < 1) r.low = 1; + r.high = (int)(p75 + 3. * (p75 - p25) + .499); + if (r.low > r.avg - MAX_STDDEV * r.std) r.low = (int)(r.avg - MAX_STDDEV * r.std + .499); + r.low = tmp > max_len? 
tmp : max_len; + if (r.high < r.avg - MAX_STDDEV * r.std) r.high = (int)(r.avg + MAX_STDDEV * r.std + .499); + ksprintf(msg, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high); + free(isize); + return r; +} + +typedef struct { + int n_cigar, beg, end, len; + int64_t pos; + uint32_t *cigar; +} pairaux_t; + +extern unsigned char nst_nt4_table[256]; + +void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const bsw2pestat_t *st, const bsw2hit_t *h, int l_mseq, const char *mseq, bsw2hit_t *a, int8_t g_mat[25]) +{ + extern void seq_reverse(int len, ubyte_t *seq, int is_comp); + int64_t k, beg, end; + uint8_t *seq, *ref; + int i; + // compute the region start and end + a->n_seeds = 1; a->flag |= BSW2_FLAG_MATESW; // before calling this routine, *a has been cleared with memset(0); the flag is set with 1<<6/7 + if (h->is_rev == 0) { + beg = (int64_t)(h->k + st->avg - EXT_STDDEV * st->std - l_mseq + .499); + if (beg < h->k) beg = h->k; + end = (int64_t)(h->k + st->avg + EXT_STDDEV * st->std + .499); + a->is_rev = 1; a->flag |= 16; + } else { + beg = (int64_t)(h->k + h->end - h->beg - st->avg - EXT_STDDEV * st->std + .499); + end = (int64_t)(h->k + h->end - h->beg - st->avg + EXT_STDDEV * st->std + l_mseq + .499); + if (end > h->k + (h->end - h->beg)) end = h->k + (h->end - h->beg); + a->is_rev = 0; + } + if (beg < 1) beg = 1; + if (end > l_pac) end = l_pac; + if (end - beg < l_mseq) return; + // generate the sequence + seq = malloc(l_mseq + (end - beg)); + ref = seq + l_mseq; + for (k = beg; k < end; ++k) + ref[k - beg] = pac[k>>2] >> ((~k&3)<<1) & 0x3; + if (h->is_rev == 0) { + for (i = 0; i < l_mseq; ++i) { // on the reverse strand + int c = nst_nt4_table[(int)mseq[i]]; + seq[l_mseq - 1 - i] = c > 3? 4 : 3 - c; + } + } else { + for (i = 0; i < l_mseq; ++i) // on the forward strand + seq[i] = nst_nt4_table[(int)mseq[i]]; + } + { + int flag = KSW_XSUBO | KSW_XSTART | (l_mseq * g_mat[0] < 250? 
/*
 * Paired-end post-processing of per-read hit sets.
 *
 * Reads are stored as consecutive pairs: hits[i]/seq[i] is the first read
 * and hits[i+1]/seq[i+1] its mate, for even i.  The insert-size
 * distribution is first estimated with bsw2_stat().  Every hit then gets
 * its read1/read2 flag bit (1<<6 or 1<<7).  When the estimate succeeded and
 * neither end has more than one hit, bsw2_pair1() aligns the mate near the
 * existing hit (unless opt->skip_sw) and the result is used to rescue an
 * unmapped end, replace a lower-scoring hit, raise the suboptimal score G2,
 * or move one end.  Bit 2 of flag is set on the hits of pairs handled this
 * way.
 *
 * Hit sets are modified in place.  A summary line is written to stderr.
 */
void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hits)
{
	extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS);
	bsw2pestat_t pes;
	int i, j, k, n_rescued = 0, n_moved = 0, n_fixed = 0;
	int8_t g_mat[25];
	kstring_t msg;
	memset(&msg, 0, sizeof(kstring_t));
	pes = bsw2_stat(n, hits, &msg, opt->max_ins);
	// 5x5 scoring matrix: match/mismatch for the first four columns, 0 for the fifth
	for (i = k = 0; i < 5; ++i) {
		for (j = 0; j < 4; ++j)
			g_mat[k++] = i == j? opt->a : -opt->b;
		g_mat[k++] = 0;
	}
	for (i = 0; i < n; i += 2) {
		bsw2hit_t a[2]; // a[j]: mate-alignment result for read j of the pair
		memset(&a, 0, sizeof(bsw2hit_t) * 2);
		a[0].flag = 1<<6; a[1].flag = 1<<7;
		for (j = 0; j < 2; ++j) { // set the read1/2 flag
			if (hits[i+j] == 0) continue;
			for (k = 0; k < hits[i+j]->n; ++k) {
				bsw2hit_t *p = &hits[i+j]->hits[k];
				p->flag |= 1<<(6+j);
			}
		}
		if (pes.failed) continue;
		if (hits[i] == 0 || hits[i+1] == 0) continue; // one end has excessive N
		if (hits[i]->n != 1 && hits[i+1]->n != 1) continue; // no end has exactly one hit
		if (hits[i]->n > 1 || hits[i+1]->n > 1) continue; // one read has more than one hit
		if (!opt->skip_sw) {
			// align each mate around the other end's single hit
			if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1], g_mat);
			if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0], g_mat);
		} // else a[0].G == a[1].G == a[0].G2 == a[1].G2 == 0
		// the following enumerate all possibilities. It is tedious but necessary...
		if (hits[i]->n + hits[i+1]->n == 1) { // one end mapped; the other not;
			bwtsw2_t *p[2]; // p[0]: the mapped end; p[1]: the unmapped end to be rescued
			int which;
			if (hits[i]->n == 1) p[0] = hits[i], p[1] = hits[i+1], which = 1;
			else p[0] = hits[i+1], p[1] = hits[i], which = 0;
			if (a[which].G == 0) continue;
			a[which].flag |= BSW2_FLAG_RESCUED;
			if (p[1]->max == 0) {
				p[1]->max = 1;
				p[1]->hits = malloc(sizeof(bsw2hit_t));
			}
			p[1]->hits[0] = a[which];
			p[1]->n = 1;
			p[0]->hits[0].flag |= 2;
			p[1]->hits[0].flag |= 2;
			++n_rescued;
		} else { // then both ends mapped
			int is_fixed = 0;
			//fprintf(stderr, "%d; %lld,%lld; %d,%d\n", a[0].is_rev, hits[i]->hits[0].k, a[0].k, hits[i]->hits[0].end, a[0].end);
			for (j = 0; j < 2; ++j) { // fix wrong mappings and wrong suboptimal alignment score
				bsw2hit_t *p = &hits[i+j]->hits[0];
				if (p->G < a[j].G) { // the original mapping is suboptimal
					a[j].G2 = a[j].G2 > p->G? a[j].G2 : p->G; // FIXME: reset BSW2_FLAG_TANDEM?
					*p = a[j];
					++n_fixed;
					is_fixed = 1;
				} else if (p->k != a[j].k && p->G2 < a[j].G) {
					p->G2 = a[j].G;
				} else if (p->k == a[j].k && p->G2 < a[j].G2) {
					p->G2 = a[j].G2;
				}
			}
			if (hits[i]->hits[0].k == a[0].k && hits[i+1]->hits[0].k == a[1].k) { // properly paired and no ends need to be moved
				for (j = 0; j < 2; ++j)
					hits[i+j]->hits[0].flag |= 2 | (a[j].flag & BSW2_FLAG_TANDEM);
			} else if (hits[i]->hits[0].k == a[0].k || hits[i+1]->hits[0].k == a[1].k) { // a tandem match
				for (j = 0; j < 2; ++j) {
					hits[i+j]->hits[0].flag |= 2;
					if (hits[i+j]->hits[0].k != a[j].k)
						hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM;
				}
			} else if (!is_fixed && (a[0].G || a[1].G)) { // it is possible to move one end
				if (a[0].G && a[1].G) { // now we have two "proper pairs"
					int G[2];
					double diff;
					G[0] = hits[i]->hits[0].G + a[1].G;
					G[1] = hits[i+1]->hits[0].G + a[0].G;
					diff = fabs(G[0] - G[1]) / (opt->a + opt->b) / ((hits[i]->hits[0].len + a[1].len + hits[i+1]->hits[0].len + a[0].len) / 2.);
					// when the two candidate pairs differ enough, zero one candidate's score so a single choice remains
					if (diff > 0.05) a[G[0] > G[1]? 0 : 1].G = 0;
				}
				if (a[0].G == 0 || a[1].G == 0) { // one proper pair only
					bsw2hit_t *p[2]; // p[0] points the unchanged hit; p[1] to the hit to be moved
					int which, isize;
					double dev, diff;
					if (a[0].G) p[0] = &hits[i+1]->hits[0], p[1] = &hits[i]->hits[0], which = 0;
					else p[0] = &hits[i]->hits[0], p[1] = &hits[i+1]->hits[0], which = 1;
					isize = p[0]->is_rev? p[0]->k + p[0]->len - a[which].k : a[which].k + a[which].len - p[0]->k;
					dev = fabs(isize - pes.avg) / pes.std; // deviation from the mean insert size, in std.dev units
					diff = (double)(p[1]->G - a[which].G) / (opt->a + opt->b) / (p[1]->end - p[1]->beg) * 100.0;
					if (diff < dev * 2.) { // then move (heuristic)
						a[which].G2 = a[which].G;
						p[1][0] = a[which];
						p[1]->flag |= BSW2_FLAG_MOVED | 2;
						p[0]->flag |= 2;
						++n_moved;
					}
				}
			} else if (is_fixed) {
				hits[i+0]->hits[0].flag |= 2;
				hits[i+1]->hits[0].flag |= 2;
			}
		}
	}
	ksprintf(&msg, "[%s] #fixed=%d, #rescued=%d, #moved=%d\n", __func__, n_fixed, n_rescued, n_moved);
	fputs(msg.s, stderr);
	free(msg.s);
}
strerror(errno) : "Out of memory"); + exit(EXIT_FAILURE); + } + ks = kseq_init(fp); // initialize the FASTA/Q parser + opt = mem_opt_init(); // initialize the BWA-MEM parameters to the default values + + while (kseq_read(ks) >= 0) { // read one sequence + mem_alnreg_v ar; + int i, k; + ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac, ks->seq.l, ks->seq.s); // get all the hits + for (i = 0; i < ar.n; ++i) { // traverse each hit + mem_aln_t a; + if (ar.a[i].secondary >= 0) continue; // skip secondary alignments + a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR + // print alignment + printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq); + for (k = 0; k < a.n_cigar; ++k) // print CIGAR + printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]); + printf("\t%d\n", a.NM); // print edit distance + free(a.cigar); // don't forget to deallocate CIGAR + } + free(ar.a); // and deallocate the hit list + } + + free(opt); + kseq_destroy(ks); + gzclose(fp); + bwa_idx_destroy(idx); + return 0; +} diff --git a/src/bwa/fastmap.c b/src/bwa/fastmap.c new file mode 100644 index 000000000..dfb2e7e78 --- /dev/null +++ b/src/bwa/fastmap.c @@ -0,0 +1,526 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "bwa.h" +#include "bwamem.h" +#include "kvec.h" +#include "utils.h" +#include "bntseq.h" +#include "kseq.h" +KSEQ_DECLARE(gzFile) + +extern unsigned char nst_nt4_table[256]; + +void *kopen(const char *fn, int *_fd); +int kclose(void *a); +void kt_pipeline(int n_threads, void *(*func)(void*, int, void*, FILE*), void *shared_data, int n_steps, FILE *ostream); //JEREMIAH + +typedef struct { + kseq_t *ks, *ks2; + mem_opt_t *opt; + mem_pestat_t *pes0; + int64_t n_processed; + int copy_comment, actual_chunk_size; + bwaidx_t *idx; +} ktp_aux_t; + +typedef struct { + ktp_aux_t *aux; + int n_seqs; + bseq1_t *seqs; +} ktp_data_t; + 
+static void *process(void *shared, int step, void *_data, FILE *ostream) +{ + ktp_aux_t *aux = (ktp_aux_t*)shared; + ktp_data_t *data = (ktp_data_t*)_data; + int i; + if (step == 0) { + ktp_data_t *ret; + int64_t size = 0; + ret = calloc(1, sizeof(ktp_data_t)); + ret->seqs = bseq_read(aux->actual_chunk_size, &ret->n_seqs, aux->ks, aux->ks2); + if (ret->seqs == 0) { + free(ret); + return 0; + } + if (!aux->copy_comment) + for (i = 0; i < ret->n_seqs; ++i) { + free(ret->seqs[i].comment); + ret->seqs[i].comment = 0; + } + for (i = 0; i < ret->n_seqs; ++i) size += ret->seqs[i].l_seq; + if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, ret->n_seqs, (long)size); + return ret; + } else if (step == 1) { + const mem_opt_t *opt = aux->opt; + const bwaidx_t *idx = aux->idx; + if (opt->flag & MEM_F_SMARTPE) { + bseq1_t *sep[2]; + int n_sep[2]; + mem_opt_t tmp_opt = *opt; + bseq_classify(data->n_seqs, data->seqs, n_sep, sep); + if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] %d single-end sequences; %d paired-end sequences\n", __func__, n_sep[0], n_sep[1]); + if (n_sep[0]) { + tmp_opt.flag &= ~MEM_F_PE; + mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed, n_sep[0], sep[0], 0); + for (i = 0; i < n_sep[0]; ++i) + data->seqs[sep[0][i].id].sam = sep[0][i].sam; + } + if (n_sep[1]) { + tmp_opt.flag |= MEM_F_PE; + mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed + n_sep[0], n_sep[1], sep[1], aux->pes0); + for (i = 0; i < n_sep[1]; ++i) + data->seqs[sep[1][i].id].sam = sep[1][i].sam; + } + free(sep[0]); free(sep[1]); + } else { + //fprintf(stdout, "NSEQS: %d NAME: %s OPT->min_seed_len: %d aux->pes0->low: %d aux->pes0->high: %d\n", data->n_seqs, data->seqs[0].name, opt->min_seed_len); + //mem_pestat_t *fake; + //fprintf("SEQSEQ: %s\n", data->seqs[0].seq); + mem_process_seqs(opt /* mem_opt_t* */, idx->bwt, idx->bns, idx->pac, aux->n_processed /*int*/, data->n_seqs /*int*/, data->seqs /* bseq1_t* 
*/, aux->pes0 /* mem_pestat_t* */); + //fprintf(stdout, "aux->pes0->low: %d\n", aux->pes0->low); + } + aux->n_processed += data->n_seqs; + return data; + } else if (step == 2) { + for (i = 0; i < data->n_seqs; ++i) { + if (data->seqs[i].sam) err_fputs(data->seqs[i].sam, ostream); //JEREMIAH stdout -> ostream + free(data->seqs[i].name); free(data->seqs[i].comment); + free(data->seqs[i].seq); free(data->seqs[i].qual); free(data->seqs[i].sam); + } + free(data->seqs); free(data); + return 0; + } + return 0; +} + +static void update_a(mem_opt_t *opt, const mem_opt_t *opt0) +{ + if (opt0->a) { // matching score is changed + if (!opt0->b) opt->b *= opt->a; + if (!opt0->T) opt->T *= opt->a; + if (!opt0->o_del) opt->o_del *= opt->a; + if (!opt0->e_del) opt->e_del *= opt->a; + if (!opt0->o_ins) opt->o_ins *= opt->a; + if (!opt0->e_ins) opt->e_ins *= opt->a; + if (!opt0->zdrop) opt->zdrop *= opt->a; + if (!opt0->pen_clip5) opt->pen_clip5 *= opt->a; + if (!opt0->pen_clip3) opt->pen_clip3 *= opt->a; + if (!opt0->pen_unpaired) opt->pen_unpaired *= opt->a; + } +} + +int main_mem(int argc, char *argv[]) +{ + mem_opt_t *opt, opt0; + int fd, fd2, i, c, ignore_alt = 0, no_mt_io = 0; + int fixed_chunk_size = -1; + gzFile fp, fp2 = 0; + char *p, *rg_line = 0, *hdr_line = 0; + const char *mode = 0; + void *ko = 0, *ko2 = 0; + mem_pestat_t pes[4]; + ktp_aux_t aux; + + memset(&aux, 0, sizeof(ktp_aux_t)); + memset(pes, 0, 4 * sizeof(mem_pestat_t)); + for (i = 0; i < 4; ++i) pes[i].failed = 1; + + aux.opt = opt = mem_opt_init(); + memset(&opt0, 0, sizeof(mem_opt_t)); + while ((c = getopt(argc, argv, "1epaFMCSPVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:X:H:")) >= 0) { + if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; + else if (c == '1') no_mt_io = 1; + else if (c == 'x') mode = optarg; + else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1; + else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1; + else if (c == 'B') opt->b = atoi(optarg), opt0.b = 1; + 
else if (c == 'T') opt->T = atoi(optarg), opt0.T = 1; + else if (c == 'U') opt->pen_unpaired = atoi(optarg), opt0.pen_unpaired = 1; + else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1; + else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; + else if (c == 'a') opt->flag |= MEM_F_ALL; + else if (c == 'p') opt->flag |= MEM_F_PE | MEM_F_SMARTPE; + else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; + else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE; + else if (c == 'e') opt->flag |= MEM_F_SELF_OVLP; + else if (c == 'F') opt->flag |= MEM_F_ALN_REG; + else if (c == 'Y') opt->flag |= MEM_F_SOFTCLIP; + else if (c == 'V') opt->flag |= MEM_F_REF_HDR; + else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1; + else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1; + else if (c == 'v') bwa_verbose = atoi(optarg); + else if (c == 'j') ignore_alt = 1; + else if (c == 'r') opt->split_factor = atof(optarg), opt0.split_factor = 1.; + else if (c == 'D') opt->drop_ratio = atof(optarg), opt0.drop_ratio = 1.; + else if (c == 'm') opt->max_matesw = atoi(optarg), opt0.max_matesw = 1; + else if (c == 's') opt->split_width = atoi(optarg), opt0.split_width = 1; + else if (c == 'G') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1; + else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1; + else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1; + else if (c == 'y') opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1; + else if (c == 'C') aux.copy_comment = 1; + else if (c == 'K') fixed_chunk_size = atoi(optarg); + else if (c == 'X') opt->mask_level = atof(optarg); + else if (c == 'h') { + opt0.max_XA_hits = opt0.max_XA_hits_alt = 1; + opt->max_XA_hits = opt->max_XA_hits_alt = strtol(optarg, &p, 10); + if (*p != 0 && ispunct(*p) && isdigit(p[1])) + opt->max_XA_hits_alt = strtol(p+1, &p, 10); + } + else if (c == 'Q') { + opt0.mapQ_coef_len = 1; + 
opt->mapQ_coef_len = atoi(optarg); + opt->mapQ_coef_fac = opt->mapQ_coef_len > 0? log(opt->mapQ_coef_len) : 0; + } else if (c == 'O') { + opt0.o_del = opt0.o_ins = 1; + opt->o_del = opt->o_ins = strtol(optarg, &p, 10); + if (*p != 0 && ispunct(*p) && isdigit(p[1])) + opt->o_ins = strtol(p+1, &p, 10); + } else if (c == 'E') { + opt0.e_del = opt0.e_ins = 1; + opt->e_del = opt->e_ins = strtol(optarg, &p, 10); + if (*p != 0 && ispunct(*p) && isdigit(p[1])) + opt->e_ins = strtol(p+1, &p, 10); + } else if (c == 'L') { + opt0.pen_clip5 = opt0.pen_clip3 = 1; + opt->pen_clip5 = opt->pen_clip3 = strtol(optarg, &p, 10); + if (*p != 0 && ispunct(*p) && isdigit(p[1])) + opt->pen_clip3 = strtol(p+1, &p, 10); + } else if (c == 'R') { + if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak + } else if (c == 'H') { + if (optarg[0] != '@') { + FILE *fp; + if ((fp = fopen(optarg, "r")) != 0) { + char *buf; + buf = calloc(1, 0x10000); + while (fgets(buf, 0xffff, fp)) { + i = strlen(buf); + assert(buf[i-1] == '\n'); // a long line + buf[i-1] = 0; + hdr_line = bwa_insert_header(buf, hdr_line); + } + free(buf); + fclose(fp); + } + } else hdr_line = bwa_insert_header(optarg, hdr_line); + } else if (c == 'I') { // specify the insert size distribution + aux.pes0 = pes; + pes[1].failed = 0; + pes[1].avg = strtod(optarg, &p); + pes[1].std = pes[1].avg * .1; + if (*p != 0 && ispunct(*p) && isdigit(p[1])) + pes[1].std = strtod(p+1, &p); + pes[1].high = (int)(pes[1].avg + 4. * pes[1].std + .499); + pes[1].low = (int)(pes[1].avg - 4. 
* pes[1].std + .499); + if (pes[1].low < 1) pes[1].low = 1; + if (*p != 0 && ispunct(*p) && isdigit(p[1])) + pes[1].high = (int)(strtod(p+1, &p) + .499); + if (*p != 0 && ispunct(*p) && isdigit(p[1])) + pes[1].low = (int)(strtod(p+1, &p) + .499); + if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] mean insert size: %.3f, stddev: %.3f, max: %d, min: %d\n", + __func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low); + } + else return 1; + } + + // JEREMIAH turn on soft clipping + opt->flag |= MEM_F_SOFTCLIP; + + + if (rg_line) { + hdr_line = bwa_insert_header(rg_line, hdr_line); + free(rg_line); + } + + if (opt->n_threads < 1) opt->n_threads = 1; + if (optind + 1 >= argc || optind + 3 < argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); + fprintf(stderr, "Algorithm options:\n\n"); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); + fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); + fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop); + fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); + fprintf(stderr, " -y INT seed occurrence for the 3rd round seeding [%ld]\n", (long)opt->max_mem_intv); +// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); + fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); + fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio); + fprintf(stderr, " -W INT discard a chain if seeded bases shorter than INT [0]\n"); + fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw); + fprintf(stderr, " -S skip mate rescue\n"); + fprintf(stderr, " -P skip pairing; mate rescue performed 
unless -S also in use\n"); + fprintf(stderr, " -e discard full-length exact matches\n"); + fprintf(stderr, "\nScoring options:\n\n"); + fprintf(stderr, " -A INT score for a sequence match, which scales options -TdBOELU unless overridden [%d]\n", opt->a); + fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); + fprintf(stderr, " -O INT[,INT] gap open penalties for deletions and insertions [%d,%d]\n", opt->o_del, opt->o_ins); + fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins); + fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3); + fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n\n", opt->pen_unpaired); + fprintf(stderr, " -x STR read type. Setting -x changes multiple parameters unless overriden [null]\n"); + fprintf(stderr, " pacbio: -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0 (PacBio reads to ref)\n"); + fprintf(stderr, " ont2d: -k14 -W20 -r10 -A1 -B1 -O1 -E1 -L0 (Oxford Nanopore 2D-reads to ref)\n"); + fprintf(stderr, " intractg: -B9 -O16 -L5 (intra-species contigs to ref)\n"); +// fprintf(stderr, " pbread: -k13 -W40 -c1000 -r10 -A1 -B1 -O1 -E1 -N25 -FeaD.001\n"); + fprintf(stderr, "\nInput/output options:\n\n"); + fprintf(stderr, " -p smart pairing (ignoring in2.fq)\n"); + fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); + fprintf(stderr, " -H STR/FILE insert STR to header if it starts with @; or insert lines in FILE [null]\n"); + fprintf(stderr, " -j treat ALT contigs as part of the primary assembly (i.e. 
ignore .alt file)\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose); + fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T); + fprintf(stderr, " -h INT[,INT] if there are 80%% of the max score, output all in XA [%d,%d]\n", opt->max_XA_hits, opt->max_XA_hits_alt); + fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); + fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); + fprintf(stderr, " -V output the reference FASTA header in the XR tag\n"); + fprintf(stderr, " -Y use soft clipping for supplementary alignments\n"); + fprintf(stderr, " -M mark shorter split hits as secondary\n\n"); + fprintf(stderr, " -I FLOAT[,FLOAT[,INT[,INT]]]\n"); + fprintf(stderr, " specify the mean, standard deviation (10%% of the mean if absent), max\n"); + fprintf(stderr, " (4 sigma from the mean if absent) and min of the insert size distribution.\n"); + fprintf(stderr, " FR orientation only. [inferred]\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n"); + fprintf(stderr, "\n"); + free(opt); + return 1; + } + + if (mode) { + if (strcmp(mode, "intractg") == 0) { + if (!opt0.o_del) opt->o_del = 16; + if (!opt0.o_ins) opt->o_ins = 16; + if (!opt0.b) opt->b = 9; + if (!opt0.pen_clip5) opt->pen_clip5 = 5; + if (!opt0.pen_clip3) opt->pen_clip3 = 5; + } else if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "pbread") == 0 || strcmp(mode, "ont2d") == 0) { + if (!opt0.o_del) opt->o_del = 1; + if (!opt0.e_del) opt->e_del = 1; + if (!opt0.o_ins) opt->o_ins = 1; + if (!opt0.e_ins) opt->e_ins = 1; + if (!opt0.b) opt->b = 1; + if (opt0.split_factor == 0.) opt->split_factor = 10.; + if (strcmp(mode, "pbread") == 0) { // pacbio read-to-read setting; NOT working well! 
+ opt->flag |= MEM_F_ALL | MEM_F_SELF_OVLP | MEM_F_ALN_REG; + if (!opt0.min_chain_weight) opt->min_chain_weight = 40; + if (!opt0.max_occ) opt->max_occ = 1000; + if (!opt0.min_seed_len) opt->min_seed_len = 13; + if (!opt0.max_chain_extend) opt->max_chain_extend = 25; + if (opt0.drop_ratio == 0.) opt->drop_ratio = .001; + } else if (strcmp(mode, "ont2d") == 0) { + if (!opt0.min_chain_weight) opt->min_chain_weight = 20; + if (!opt0.min_seed_len) opt->min_seed_len = 14; + if (!opt0.pen_clip5) opt->pen_clip5 = 0; + if (!opt0.pen_clip3) opt->pen_clip3 = 0; + } else { + if (!opt0.min_chain_weight) opt->min_chain_weight = 40; + if (!opt0.min_seed_len) opt->min_seed_len = 17; + if (!opt0.pen_clip5) opt->pen_clip5 = 0; + if (!opt0.pen_clip3) opt->pen_clip3 = 0; + } + } else { + fprintf(stderr, "[E::%s] unknown read type '%s'\n", __func__, mode); + return 1; // FIXME memory leak + } + } else update_a(opt, &opt0); + bwa_fill_scmat(opt->a, opt->b, opt->mat); + + aux.idx = bwa_idx_load_from_shm(argv[optind]); + if (aux.idx == 0) { + if ((aux.idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak + } else if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] load the bwa index from shared memory\n", __func__); + if (ignore_alt) + for (i = 0; i < aux.idx->bns->n_seqs; ++i) + aux.idx->bns->anns[i].is_alt = 0; + + ko = kopen(argv[optind + 1], &fd); + if (ko == 0) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 1]); + return 1; + } + fp = gzdopen(fd, "r"); + aux.ks = kseq_init(fp); + if (optind + 2 < argc) { + if (opt->flag&MEM_F_PE) { + if (bwa_verbose >= 2) + fprintf(stderr, "[W::%s] when '-p' is in use, the second query file is ignored.\n", __func__); + } else { + ko2 = kopen(argv[optind + 2], &fd2); + if (ko2 == 0) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 2]); + return 1; + } + fp2 = gzdopen(fd2, "r"); + aux.ks2 = kseq_init(fp2); + 
opt->flag |= MEM_F_PE; + } + } + if (!(opt->flag & MEM_F_ALN_REG)) + bwa_print_sam_hdr(aux.idx->bns, hdr_line); + aux.actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads; + + + //JEREMIAH + char* cbuffer = NULL; + size_t bufferSize = 0; + FILE* ostream = open_memstream(&cbuffer, &bufferSize); + + //kt_pipeline(no_mt_io? 1 : 2, process, &aux, 3, ostream); + + // JEREMIAH + + int fake_nprocessed = 0; + int fake_nseqs = 1; + mem_pestat_t *fake_mem_pestat_t; + //fake_seqs = (bseq1_t*)malloc(fake_nseqs * sizeof(bseq1_t)); + + bseq1_t * fake_seqs = (bseq1_t*)malloc(sizeof(bseq1_t)); + bseq1_t * s1 = (bseq1_t*)malloc(sizeof(bseq1_t)); + s1->l_seq = 40; + s1->id = 0; + + s1->name = "myseq1"; + s1->seq = malloc(s1->l_seq * sizeof(char)); + strcpy(s1->seq, "AAAAAAAAAAAAATCTGAGCAAAAAAAAAAAAATCTGAGC"); + s1->qual = malloc(s1->l_seq * sizeof(char)); + strcpy(s1->qual,"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII"); + +/* + bseq1_t s2; + s2.l_seq = 40; + s2.id = 0; + char * name2 = malloc(6 * sizeof(char)); + strcpy(name2, "myseq1"); + + char * name2 = malloc(6 * sizeof(char)); + char seq2[] = "TTCGGCTACGACTACGATCG"; + char qual2[] ="IIIIIIIIIIIIIIIIIIII"; + s2.name = name2; + s2.seq = seq2; + s2.qual = qual2; +*/ + fprintf(stdout, "BEFORE %s\n\n", s1->seq); + int kk; + for (kk = 0; kk < s1->l_seq; ++kk) + s1->seq[kk] = s1->seq[kk] < 4? 
s1->seq[kk] : nst_nt4_table[(int)s1->seq[kk]]; + fprintf(stdout, "AFTER %s\n\n", s1->seq); + + //fake_seqs = s1; + //bseq1_t fake_seqs[1] = { &s1 }; + mem_process_seqs(aux.opt, aux.idx->bwt, aux.idx->bns, aux.idx->pac, fake_nprocessed, fake_nseqs/*1*/, s1, fake_mem_pestat_t); + fprintf(stdout, "%s\n%s\n", s1->sam, s1->sam); + //mem_alnreg_v answer = mem_align1(aux.opt, aux.idx->bwt, aux.idx->bns, aux.idx->pac, s1.l_seq, s1.seq); + //mem_aln_t answer2 = mem_reg2aln(aux.opt, aux.idx->bns, aux.idx->pac, s1.l_seq, s1.seq, &answer); + //fprintf(stdout, "N: %d M %d score: %d, ref %d, pos %d pos %d\n", answer.n, answer.m, answer.a->score, answer.a->rid, answer.a->rb, answer.a->re); + + free(s1); + + + //JEREMIAH + //fclose(ostream); + //fprintf(stdout, "THIS IS THE FINAL THING:\n%s", cbuffer); + + free(hdr_line); + free(opt); + exit(1); + bwa_idx_destroy(aux.idx); + kseq_destroy(aux.ks); + err_gzclose(fp); kclose(ko); + + if (aux.ks2) { + kseq_destroy(aux.ks2); + err_gzclose(fp2); kclose(ko2); + } + return 0; +} + +int main_fastmap(int argc, char *argv[]) +{ + int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1, max_len = INT_MAX; + uint64_t max_intv = 0; + kseq_t *seq; + bwtint_t k; + gzFile fp; + smem_i *itr; + const bwtintv_v *a; + bwaidx_t *idx; + + while ((c = getopt(argc, argv, "w:l:pi:I:L:")) >= 0) { + switch (c) { + case 'p': print_seq = 1; break; + case 'w': min_iwidth = atoi(optarg); break; + case 'l': min_len = atoi(optarg); break; + case 'i': min_intv = atoi(optarg); break; + case 'I': max_intv = atol(optarg); break; + case 'L': max_len = atoi(optarg); break; + default: return 1; + } + } + if (optind + 1 >= argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa fastmap [options] \n\n"); + fprintf(stderr, "Options: -l INT min SMEM length to output [%d]\n", min_len); + fprintf(stderr, " -w INT max interval size to find coordiantes [%d]\n", min_iwidth); + fprintf(stderr, " -i INT min SMEM interval size [%d]\n", min_intv); + 
fprintf(stderr, " -l INT max MEM length [%d]\n", max_len); + fprintf(stderr, " -I INT stop if MEM is longer than -l with a size less than INT [%ld]\n", (long)max_intv); + fprintf(stderr, "\n"); + return 1; + } + + fp = xzopen(argv[optind + 1], "r"); + seq = kseq_init(fp); + if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1; + itr = smem_itr_init(idx->bwt); + smem_config(itr, min_intv, max_len, max_intv); + while (kseq_read(seq) >= 0) { + err_printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); + if (print_seq) { + err_putchar('\t'); + err_puts(seq->seq.s); + } else err_putchar('\n'); + for (i = 0; i < seq->seq.l; ++i) + seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; + smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s); + while ((a = smem_next(itr)) != 0) { + for (i = 0; i < a->n; ++i) { + bwtintv_t *p = &a->a[i]; + if ((uint32_t)p->info - (p->info>>32) < min_len) continue; + err_printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); + if (p->x[2] <= min_iwidth) { + for (k = 0; k < p->x[2]; ++k) { + bwtint_t pos; + int len, is_rev, ref_id; + len = (uint32_t)p->info - (p->info>>32); + pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &is_rev); + if (is_rev) pos -= len - 1; + bns_cnt_ambi(idx->bns, pos, len, &ref_id); + err_printf("\t%s:%c%ld", idx->bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - idx->bns->anns[ref_id].offset) + 1); + } + } else err_puts("\t*"); + err_putchar('\n'); + } + } + err_puts("//"); + } + + smem_itr_destroy(itr); + bwa_idx_destroy(idx); + kseq_destroy(seq); + err_gzclose(fp); + return 0; +} diff --git a/src/bwa/is.c b/src/bwa/is.c new file mode 100644 index 000000000..46f177245 --- /dev/null +++ b/src/bwa/is.c @@ -0,0 +1,223 @@ +/* + * sais.c for sais-lite + * Copyright (c) 2008 Yuta Mori All Rights Reserved. 
/*
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include <stdlib.h>

#ifdef USE_MALLOC_WRAPPERS
#  include "malloc_wrap.h"
#endif

typedef unsigned char ubyte_t;

/*
 * Read symbol i of the text. Expands against the `T` and `cs` that are in
 * scope at the use site: when cs == sizeof(int) the text is read as an
 * array of int, otherwise as an array of unsigned char.
 */
#define chr(i) (cs == sizeof(int) ? ((const int *)T)[(i)]:((const unsigned char *)T)[(i)])

/* find the start or end of each bucket */

/*
 * Count symbol occurrences: clears C[0..k-1], then increments C[s] once
 * for every symbol s among the n symbols of T (symbol width given by cs).
 */
static void getCounts(const unsigned char *T, int *C, int n, int k, int cs)
{
	int i;
	for (i = 0; i < k; ++i) C[i] = 0;
	for (i = 0; i < n; ++i) ++C[chr(i)];
}

/*
 * Convert the per-symbol counts C[0..k-1] into bucket boundaries B[0..k-1].
 *   end != 0: B[i] is the running total including C[i] (one past the bucket).
 *   end == 0: B[i] is the running total excluding C[i] (start of the bucket).
 * C[i] is always read before B[i] is written, so B may alias C.
 */
static void getBuckets(const int *C, int *B, int k, int end)
{
	int i, sum = 0;
	if (end) {
		for (i = 0; i < k; ++i) {
			sum += C[i];
			B[i] = sum;
		}
	} else {
		for (i = 0; i < k; ++i) {
			sum += C[i];
			B[i] = sum - C[i];
		}
	}
}

/* compute SA */

/*
 * Two induction sweeps over SA: a left-to-right sweep that appends at
 * bucket starts, followed by a right-to-left sweep that prepends at bucket
 * ends. Entries are temporarily stored bit-complemented (~j) as an in-place
 * marker and flipped back as each sweep passes over them.
 * When C and B are the same array the counts are recomputed before each
 * sweep, because the bucket boundaries have overwritten them.
 */
static void induceSA(const unsigned char *T, int *SA, int *C, int *B, int n, int k, int cs)
{
	int *b, i, j;
	int  c0, c1;
	/* compute SAl */
	if (C == B) getCounts(T, C, n, k, cs);
	getBuckets(C, B, k, 0);	/* find starts of buckets */
	j = n - 1;
	b = SA + B[c1 = chr(j)];
	*b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j;
	for (i = 0; i < n; ++i) {
		j = SA[i], SA[i] = ~j;
		if (0 < j) {
			--j;
			if ((c0 = chr(j)) != c1) {
				/* switching bucket: save the cursor of the one we leave */
				B[c1] = b - SA;
				b = SA + B[c1 = c0];
			}
			*b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j;
		}
	}
	/* compute SAs */
	if (C == B) getCounts(T, C, n, k, cs);
	getBuckets(C, B, k, 1);	/* find ends of buckets */
	for (i = n - 1, b = SA + B[c1 = 0]; 0 <= i; --i) {
		if (0 < (j = SA[i])) {
			--j;
			if ((c0 = chr(j)) != c1) {
				B[c1] = b - SA;
				b = SA + B[c1 = c0];
			}
			*--b = ((j == 0) || (chr(j - 1) > c1)) ? ~j : j;
		} else SA[i] = ~j;
	}
}

/*
 * find the suffix array SA of T[0..n-1] in {0..k-1}^n use a working
 * space (excluding T and SA) of at most 2n+O(1) for a constant alphabet
 *
 * T   text; read as int symbols when cs == sizeof(int), else as bytes
 * SA  output, n entries; SA[n..n+fs-1] is additionally used as scratch
 * fs  number of spare ints available directly after SA[n-1]
 * n   number of symbols in T
 * k   alphabet size
 * cs  symbol width in bytes
 *
 * Returns 0 on success, -2 when a bucket array cannot be allocated or the
 * recursive call fails.
 */
static int sais_main(const unsigned char *T, int *SA, int fs, int n, int k, int cs)
{
	int *C, *B, *RA;
	int  i, j, c, m, p, q, plen, qlen, name;
	int  c0, c1;
	int  diff;

	/* stage 1: reduce the problem by at least 1/2 sort all the
	 * S-substrings */
	if (k <= fs) {
		/* enough spare room behind SA: keep the bucket arrays there */
		C = SA + n;
		B = (k <= (fs - k)) ? C + k : C;
	} else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2;
	getCounts(T, C, n, k, cs);
	getBuckets(C, B, k, 1);	/* find ends of buckets */
	for (i = 0; i < n; ++i) SA[i] = 0;
	for (i = n - 2, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
		if ((c0 = chr(i)) < (c1 + c)) c = 1;
		else if (c != 0) SA[--B[c1]] = i + 1, c = 0;
	}
	induceSA(T, SA, C, B, n, k, cs);
	if (fs < k) free(C);	/* only heap-allocated when k did not fit in fs */
	/* compact all the sorted substrings into the first m items of SA
	 * 2*m must be not larger than n (provable) */
	for (i = 0, m = 0; i < n; ++i) {
		p = SA[i];
		if ((0 < p) && (chr(p - 1) > (c0 = chr(p)))) {
			for (j = p + 1; (j < n) && (c0 == (c1 = chr(j))); ++j);
			if ((j < n) && (c0 < c1)) SA[m++] = p;
		}
	}
	for (i = m; i < n; ++i) SA[i] = 0;	/* init the name array buffer */
	/* store the length of all substrings */
	for (i = n - 2, j = n, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
		if ((c0 = chr(i)) < (c1 + c)) c = 1;
		else if (c != 0) {
			SA[m + ((i + 1) >> 1)] = j - i - 1;
			j = i + 1;
			c = 0;
		}
	}
	/* find the lexicographic names of all substrings */
	for (i = 0, name = 0, q = n, qlen = 0; i < m; ++i) {
		p = SA[i], plen = SA[m + (p >> 1)], diff = 1;
		if (plen == qlen) {
			for (j = 0; (j < plen) && (chr(p + j) == chr(q + j)); j++);
			if (j == plen) diff = 0;
		}
		if (diff != 0) ++name, q = p, qlen = plen;
		SA[m + (p >> 1)] = name;
	}

	/* stage 2: solve the reduced problem recurse if names are not yet
	 * unique */
	if (name < m) {
		RA = SA + n + fs - m;
		for (i = n - 1, j = m - 1; m <= i; --i) {
			if (SA[i] != 0) RA[j--] = SA[i] - 1;
		}
		/* C was already released above, so nothing to free on failure */
		if (sais_main((unsigned char *) RA, SA, fs + n - m * 2, m, name, sizeof(int)) != 0) return -2;
		for (i = n - 2, j = m - 1, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
			if ((c0 = chr(i)) < (c1 + c)) c = 1;
			else if (c != 0) RA[j--] = i + 1, c = 0; /* get p1 */
		}
		for (i = 0; i < m; ++i) SA[i] = RA[SA[i]]; /* get index */
	}
	/* stage 3: induce the result for the original problem */
	if (k <= fs) {
		C = SA + n;
		B = (k <= (fs - k)) ? C + k : C;
	} else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2;
	/* put all left-most S characters into their buckets */
	getCounts(T, C, n, k, cs);
	getBuckets(C, B, k, 1);	/* find ends of buckets */
	for (i = m; i < n; ++i) SA[i] = 0; /* init SA[m..n-1] */
	for (i = m - 1; 0 <= i; --i) {
		j = SA[i], SA[i] = 0;
		SA[--B[chr(j)]] = j;
	}
	induceSA(T, SA, C, B, n, k, cs);
	if (fs < k) free(C);
	return 0;
}

/**
 * Constructs the suffix array of a given string.
 * @param T[0..n-1] The input string.
 * @param SA[0..n] The output array of suffixes. SA[0] is always set to n
 *                 (the empty suffix); SA[1..n] receive the sorted suffixes.
 * @param n The length of the given string.
 * @return 0 if no error occurred, -1 on a NULL pointer or negative n,
 *         -2 if the construction ran out of memory.
 */
int is_sa(const ubyte_t *T, int *SA, int n)
{
	if ((T == NULL) || (SA == NULL) || (n < 0)) return -1;
	SA[0] = n;
	if (n <= 1) {
		if (n == 1) SA[1] = 0;
		return 0;
	}
	return sais_main(T, SA+1, 0, n, 256, 1);
}

/**
 * Constructs the burrows-wheeler transformed string of a given string.
 * The transform is written back into T with the row of the full-length
 * suffix removed, so T still holds exactly n symbols afterwards.
 * @param T[0..n-1] The input string; overwritten with the result.
 * @param n The length of the given string.
 * @return The primary index if no error occurred, -1 otherwise (invalid
 *         argument or out of memory). T is left untouched on failure.
 */
int is_bwt(ubyte_t *T, int n)
{
	int *SA, i, primary = 0;

	/* validate before allocating so a negative n never reaches calloc */
	if ((T == NULL) || (n < 0)) return -1;
	SA = (int*)calloc((size_t)n + 1, sizeof(int));
	if (SA == NULL) return -1;

	if (is_sa(T, SA, n)) {
		free(SA);	/* previously leaked on this path */
		return -1;
	}

	/* replace every suffix start by the symbol preceding it; the slot of
	 * suffix 0 has no predecessor and marks the primary index */
	for (i = 0; i <= n; ++i) {
		if (SA[i] == 0) primary = i;
		else SA[i] = T[SA[i] - 1];
	}
	/* copy back, skipping the primary slot */
	for (i = 0; i < primary; ++i) T[i] = SA[i];
	for (; i < n; ++i) T[i] = SA[i + 1];
	free(SA);
	return primary;
}

// The text below is the beginning of the next file in this patch
// (src/bwa/kbtree.h). It shares a physical source line with the end of is.c
// and is kept verbatim, commented out so that this unit remains valid C.
//
// diff --git a/src/bwa/kbtree.h b/src/bwa/kbtree.h
// new file mode 100644
// index 000000000..0da101da1
// --- /dev/null
// +++ b/src/bwa/kbtree.h
// @@ -0,0 +1,388 @@
// +/*-
// + * Copyright 1997-1999, 2001, John-Mark Gurney.
// + *                 2008-2009, Attractive Chaos
// + *
// + * Redistribution and use in source and binary forms, with or without
// + * modification, are permitted provided that the following conditions
// + * are met:
// + *
// + * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef __AC_KBTREE_H +#define __AC_KBTREE_H + +#include +#include +#include + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +typedef struct { + int32_t is_internal:1, n:31; +} kbnode_t; + +#define __KB_KEY(type, x) ((type*)((char*)x + 4)) +#define __KB_PTR(btr, x) ((kbnode_t**)((char*)x + btr->off_ptr)) + +#define __KB_TREE_T(name) \ + typedef struct { \ + kbnode_t *root; \ + int off_key, off_ptr, ilen, elen; \ + int n, t; \ + int n_keys, n_nodes; \ + } kbtree_##name##_t; + +#define __KB_INIT(name, key_t) \ + kbtree_##name##_t *kb_init_##name(int size) \ + { \ + kbtree_##name##_t *b; \ + b = (kbtree_##name##_t*)calloc(1, sizeof(kbtree_##name##_t)); \ + b->t = ((size - 4 - sizeof(void*)) / (sizeof(void*) + sizeof(key_t)) + 1) >> 1; \ + if (b->t < 2) { \ + free(b); return 0; \ + } \ + b->n = 2 * b->t - 1; \ + b->off_ptr = 4 + b->n * sizeof(key_t); \ + b->ilen = (4 + sizeof(void*) + b->n * (sizeof(void*) + sizeof(key_t)) + 3) >> 2 << 2; \ + b->elen = (b->off_ptr + 3) >> 2 << 2; \ + b->root = (kbnode_t*)calloc(1, b->ilen); \ + ++b->n_nodes; \ + return b; \ + } + +#define __kb_destroy(b) do { \ + int i, max = 8; \ + kbnode_t *x, **top, **stack = 0; \ + if (b) { \ + top = stack = (kbnode_t**)calloc(max, sizeof(kbnode_t*)); \ + *top++ = (b)->root; \ + while (top != stack) { \ + x = *--top; \ + if (x == 0 || x->is_internal == 0) { free(x); continue; } \ + for (i = 0; i <= x->n; ++i) \ + if (__KB_PTR(b, x)[i]) { \ + if (top - stack == max) { \ + max <<= 1; \ + stack = (kbnode_t**)realloc(stack, max * sizeof(kbnode_t*)); \ + top = stack + (max>>1); \ + } \ + *top++ = __KB_PTR(b, x)[i]; \ + } \ + free(x); \ + } \ + } \ + free(b); free(stack); \ + } while (0) + +#define __kb_get_first(key_t, b, ret) do { \ + kbnode_t *__x = (b)->root; \ + while (__KB_PTR(b, __x)[0] != 0) \ + __x = __KB_PTR(b, __x)[0]; \ + (ret) = __KB_KEY(key_t, __x)[0]; \ + } while (0) + +#define __KB_GET_AUX0(name, key_t, __cmp) \ + static inline int __kb_get_aux_##name(const 
kbnode_t * __restrict x, const key_t * __restrict k, int *r) \ + { \ + int tr, *rr, begin, end, n = x->n >> 1; \ + if (x->n == 0) return -1; \ + if (__cmp(*k, __KB_KEY(key_t, x)[n]) < 0) { \ + begin = 0; end = n; \ + } else { begin = n; end = x->n - 1; } \ + rr = r? r : &tr; \ + n = end; \ + while (n >= begin && (*rr = __cmp(*k, __KB_KEY(key_t, x)[n])) < 0) --n; \ + return n; \ + } + +#define __KB_GET_AUX1(name, key_t, __cmp) \ + static inline int __kb_getp_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \ + { \ + int tr, *rr, begin = 0, end = x->n; \ + if (x->n == 0) return -1; \ + rr = r? r : &tr; \ + while (begin < end) { \ + int mid = (begin + end) >> 1; \ + if (__cmp(__KB_KEY(key_t, x)[mid], *k) < 0) begin = mid + 1; \ + else end = mid; \ + } \ + if (begin == x->n) { *rr = 1; return x->n - 1; } \ + if ((*rr = __cmp(*k, __KB_KEY(key_t, x)[begin])) < 0) --begin; \ + return begin; \ + } + +#define __KB_GET(name, key_t) \ + static key_t *kb_getp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ + { \ + int i, r = 0; \ + kbnode_t *x = b->root; \ + while (x) { \ + i = __kb_getp_aux_##name(x, k, &r); \ + if (i >= 0 && r == 0) return &__KB_KEY(key_t, x)[i]; \ + if (x->is_internal == 0) return 0; \ + x = __KB_PTR(b, x)[i + 1]; \ + } \ + return 0; \ + } \ + static inline key_t *kb_get_##name(kbtree_##name##_t *b, const key_t k) \ + { \ + return kb_getp_##name(b, &k); \ + } + +#define __KB_INTERVAL(name, key_t) \ + static void kb_intervalp_##name(kbtree_##name##_t *b, const key_t * __restrict k, key_t **lower, key_t **upper) \ + { \ + int i, r = 0; \ + kbnode_t *x = b->root; \ + *lower = *upper = 0; \ + while (x) { \ + i = __kb_getp_aux_##name(x, k, &r); \ + if (i >= 0 && r == 0) { \ + *lower = *upper = &__KB_KEY(key_t, x)[i]; \ + return; \ + } \ + if (i >= 0) *lower = &__KB_KEY(key_t, x)[i]; \ + if (i < x->n - 1) *upper = &__KB_KEY(key_t, x)[i + 1]; \ + if (x->is_internal == 0) return; \ + x = __KB_PTR(b, x)[i + 1]; \ + } \ + } \ + 
static inline void kb_interval_##name(kbtree_##name##_t *b, const key_t k, key_t **lower, key_t **upper) \ + { \ + kb_intervalp_##name(b, &k, lower, upper); \ + } + +#define __KB_PUT(name, key_t, __cmp) \ + /* x must be an internal node */ \ + static void __kb_split_##name(kbtree_##name##_t *b, kbnode_t *x, int i, kbnode_t *y) \ + { \ + kbnode_t *z; \ + z = (kbnode_t*)calloc(1, y->is_internal? b->ilen : b->elen); \ + ++b->n_nodes; \ + z->is_internal = y->is_internal; \ + z->n = b->t - 1; \ + memcpy(__KB_KEY(key_t, z), __KB_KEY(key_t, y) + b->t, sizeof(key_t) * (b->t - 1)); \ + if (y->is_internal) memcpy(__KB_PTR(b, z), __KB_PTR(b, y) + b->t, sizeof(void*) * b->t); \ + y->n = b->t - 1; \ + memmove(__KB_PTR(b, x) + i + 2, __KB_PTR(b, x) + i + 1, sizeof(void*) * (x->n - i)); \ + __KB_PTR(b, x)[i + 1] = z; \ + memmove(__KB_KEY(key_t, x) + i + 1, __KB_KEY(key_t, x) + i, sizeof(key_t) * (x->n - i)); \ + __KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[b->t - 1]; \ + ++x->n; \ + } \ + static void __kb_putp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k) \ + { \ + int i = x->n - 1; \ + if (x->is_internal == 0) { \ + i = __kb_getp_aux_##name(x, k, 0); \ + if (i != x->n - 1) \ + memmove(__KB_KEY(key_t, x) + i + 2, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + __KB_KEY(key_t, x)[i + 1] = *k; \ + ++x->n; \ + } else { \ + i = __kb_getp_aux_##name(x, k, 0) + 1; \ + if (__KB_PTR(b, x)[i]->n == 2 * b->t - 1) { \ + __kb_split_##name(b, x, i, __KB_PTR(b, x)[i]); \ + if (__cmp(*k, __KB_KEY(key_t, x)[i]) > 0) ++i; \ + } \ + __kb_putp_aux_##name(b, __KB_PTR(b, x)[i], k); \ + } \ + } \ + static void kb_putp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ + { \ + kbnode_t *r, *s; \ + ++b->n_keys; \ + r = b->root; \ + if (r->n == 2 * b->t - 1) { \ + ++b->n_nodes; \ + s = (kbnode_t*)calloc(1, b->ilen); \ + b->root = s; s->is_internal = 1; s->n = 0; \ + __KB_PTR(b, s)[0] = r; \ + __kb_split_##name(b, s, 0, r); \ + r = s; \ + } \ + 
__kb_putp_aux_##name(b, r, k); \ + } \ + static inline void kb_put_##name(kbtree_##name##_t *b, const key_t k) \ + { \ + kb_putp_##name(b, &k); \ + } + + +#define __KB_DEL(name, key_t) \ + static key_t __kb_delp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k, int s) \ + { \ + int yn, zn, i, r = 0; \ + kbnode_t *xp, *y, *z; \ + key_t kp; \ + if (x == 0) return *k; \ + if (s) { /* s can only be 0, 1 or 2 */ \ + r = x->is_internal == 0? 0 : s == 1? 1 : -1; \ + i = s == 1? x->n - 1 : -1; \ + } else i = __kb_getp_aux_##name(x, k, &r); \ + if (x->is_internal == 0) { \ + if (s == 2) ++i; \ + kp = __KB_KEY(key_t, x)[i]; \ + memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + --x->n; \ + return kp; \ + } \ + if (r == 0) { \ + if ((yn = __KB_PTR(b, x)[i]->n) >= b->t) { \ + xp = __KB_PTR(b, x)[i]; \ + kp = __KB_KEY(key_t, x)[i]; \ + __KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 1); \ + return kp; \ + } else if ((zn = __KB_PTR(b, x)[i + 1]->n) >= b->t) { \ + xp = __KB_PTR(b, x)[i + 1]; \ + kp = __KB_KEY(key_t, x)[i]; \ + __KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 2); \ + return kp; \ + } else if (yn == b->t - 1 && zn == b->t - 1) { \ + y = __KB_PTR(b, x)[i]; z = __KB_PTR(b, x)[i + 1]; \ + __KB_KEY(key_t, y)[y->n++] = *k; \ + memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, z), z->n * sizeof(key_t)); \ + if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, z), (z->n + 1) * sizeof(void*)); \ + y->n += z->n; \ + memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \ + --x->n; \ + free(z); \ + return __kb_delp_aux_##name(b, y, k, s); \ + } \ + } \ + ++i; \ + if ((xp = __KB_PTR(b, x)[i])->n == b->t - 1) { \ + if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n >= b->t) { \ + memmove(__KB_KEY(key_t, xp) + 1, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \ + 
if (xp->is_internal) memmove(__KB_PTR(b, xp) + 1, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \ + __KB_KEY(key_t, xp)[0] = __KB_KEY(key_t, x)[i - 1]; \ + __KB_KEY(key_t, x)[i - 1] = __KB_KEY(key_t, y)[y->n - 1]; \ + if (xp->is_internal) __KB_PTR(b, xp)[0] = __KB_PTR(b, y)[y->n]; \ + --y->n; ++xp->n; \ + } else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n >= b->t) { \ + __KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \ + __KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[0]; \ + if (xp->is_internal) __KB_PTR(b, xp)[xp->n] = __KB_PTR(b, y)[0]; \ + --y->n; \ + memmove(__KB_KEY(key_t, y), __KB_KEY(key_t, y) + 1, y->n * sizeof(key_t)); \ + if (y->is_internal) memmove(__KB_PTR(b, y), __KB_PTR(b, y) + 1, (y->n + 1) * sizeof(void*)); \ + } else if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n == b->t - 1) { \ + __KB_KEY(key_t, y)[y->n++] = __KB_KEY(key_t, x)[i - 1]; \ + memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \ + if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \ + y->n += xp->n; \ + memmove(__KB_KEY(key_t, x) + i - 1, __KB_KEY(key_t, x) + i, (x->n - i) * sizeof(key_t)); \ + memmove(__KB_PTR(b, x) + i, __KB_PTR(b, x) + i + 1, (x->n - i) * sizeof(void*)); \ + --x->n; \ + free(xp); \ + xp = y; \ + } else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n == b->t - 1) { \ + __KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \ + memmove(__KB_KEY(key_t, xp) + xp->n, __KB_KEY(key_t, y), y->n * sizeof(key_t)); \ + if (xp->is_internal) memmove(__KB_PTR(b, xp) + xp->n, __KB_PTR(b, y), (y->n + 1) * sizeof(void*)); \ + xp->n += y->n; \ + memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \ + --x->n; \ + free(y); \ + } \ + } \ + return __kb_delp_aux_##name(b, xp, k, s); \ + } \ + static key_t kb_delp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ + { \ + 
kbnode_t *x; \ + key_t ret; \ + ret = __kb_delp_aux_##name(b, b->root, k, 0); \ + --b->n_keys; \ + if (b->root->n == 0 && b->root->is_internal) { \ + --b->n_nodes; \ + x = b->root; \ + b->root = __KB_PTR(b, x)[0]; \ + free(x); \ + } \ + return ret; \ + } \ + static inline key_t kb_del_##name(kbtree_##name##_t *b, const key_t k) \ + { \ + return kb_delp_##name(b, &k); \ + } + +typedef struct { + kbnode_t *x; + int i; +} __kbstack_t; + +#define __kb_traverse(key_t, b, __func) do { \ + int __kmax = 8; \ + __kbstack_t *__kstack, *__kp; \ + __kp = __kstack = (__kbstack_t*)calloc(__kmax, sizeof(__kbstack_t)); \ + __kp->x = (b)->root; __kp->i = 0; \ + for (;;) { \ + while (__kp->x && __kp->i <= __kp->x->n) { \ + if (__kp - __kstack == __kmax - 1) { \ + __kmax <<= 1; \ + __kstack = (__kbstack_t*)realloc(__kstack, __kmax * sizeof(__kbstack_t)); \ + __kp = __kstack + (__kmax>>1) - 1; \ + } \ + (__kp+1)->i = 0; (__kp+1)->x = __kp->x->is_internal? __KB_PTR(b, __kp->x)[__kp->i] : 0; \ + ++__kp; \ + } \ + --__kp; \ + if (__kp >= __kstack) { \ + if (__kp->x && __kp->i < __kp->x->n) __func(&__KB_KEY(key_t, __kp->x)[__kp->i]); \ + ++__kp->i; \ + } else break; \ + } \ + free(__kstack); \ + } while (0) + +#define KBTREE_INIT(name, key_t, __cmp) \ + __KB_TREE_T(name) \ + __KB_INIT(name, key_t) \ + __KB_GET_AUX1(name, key_t, __cmp) \ + __KB_GET(name, key_t) \ + __KB_INTERVAL(name, key_t) \ + __KB_PUT(name, key_t, __cmp) \ + __KB_DEL(name, key_t) + +#define KB_DEFAULT_SIZE 512 + +#define kbtree_t(name) kbtree_##name##_t +#define kb_init(name, s) kb_init_##name(s) +#define kb_destroy(name, b) __kb_destroy(b) +#define kb_get(name, b, k) kb_get_##name(b, k) +#define kb_put(name, b, k) kb_put_##name(b, k) +#define kb_del(name, b, k) kb_del_##name(b, k) +#define kb_interval(name, b, k, l, u) kb_interval_##name(b, k, l, u) +#define kb_getp(name, b, k) kb_getp_##name(b, k) +#define kb_putp(name, b, k) kb_putp_##name(b, k) +#define kb_delp(name, b, k) kb_delp_##name(b, k) +#define 
kb_intervalp(name, b, k, l, u) kb_intervalp_##name(b, k, l, u) + +#define kb_size(b) ((b)->n_keys) + +#define kb_generic_cmp(a, b) (((b) < (a)) - ((a) < (b))) +#define kb_str_cmp(a, b) strcmp(a, b) + +#endif diff --git a/src/bwa/khash.h b/src/bwa/khash.h new file mode 100644 index 000000000..12e554246 --- /dev/null +++ b/src/bwa/khash.h @@ -0,0 +1,614 @@ +/* The MIT License + + Copyright (c) 2008, 2009, 2011 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2011-12-29 (0.2.7): + + * Minor code clean up; no actual effect. + + 2011-09-16 (0.2.6): + + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. Thank Zilong Tan for the suggestion. Reference: + + - http://code.google.com/p/ulib/ + - http://nothings.org/computer/judy/ + + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as it + is more robust to certain non-random input. + + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. + + 2011-02-14 (0.2.5): + + * Allow to declare global functions. + + 2009-09-26 (0.2.4): + + * Improve portability + + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. 
+ */ + +#define AC_VERSION_KHASH_H "0.2.6" + +#include <stdlib.h> +#include <string.h> +#include <limits.h> + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +/* compiler specific configuration */ + +#if UINT_MAX == 0xffffffffu +typedef unsigned int khint32_t; +#elif ULONG_MAX == 0xffffffffu +typedef unsigned long khint32_t; +#endif + +#if ULONG_MAX == ULLONG_MAX +typedef unsigned long khint64_t; +#else +typedef unsigned long long khint64_t; +#endif + +#ifdef _MSC_VER +#define kh_inline __inline +#else +#define kh_inline inline +#endif + +typedef khint32_t khint_t; +typedef khint_t khiter_t; +/* Bucket state: two bits per bucket, 16 buckets per flag word (index i>>4, shift (i&0xf)<<1); value 2 tests "empty", value 1 tests "deleted". */ +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) +/* Probe step for key hash k and mask m: constant 1 under KHASH_LINEAR, otherwise a key-dependent value with bit 0 forced on, then masked with m. Fully parenthesized so the expansion is safe inside a larger expression. */ +#ifdef KHASH_LINEAR +#define __ac_inc(k, m) 1 +#else +#define __ac_inc(k, m) ((((k)>>3 ^ (k)<<3) | 1) & (m)) +#endif + +#define __ac_fsize(m) ((m) < 16?
1 : (m)>>4) + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef kcalloc +#define kcalloc(N,Z) calloc(N,Z) +#endif +#ifndef kmalloc +#define kmalloc(Z) malloc(Z) +#endif +#ifndef krealloc +#define krealloc(P,Z) realloc(P,Z) +#endif +#ifndef kfree +#define kfree(P) free(P) +#endif + +static const double __ac_HASH_UPPER = 0.77; + +#define __KHASH_TYPE(name, khkey_t, khval_t) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; + +#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ + extern kh_##name##_t *kh_init_##name(void); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + SCOPE kh_##name##_t *kh_init_##name(void) { \ + return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ + } \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + kfree((void *)h->keys); kfree(h->flags); \ + kfree((void *)h->vals); \ + kfree(h); \ + } \ + } \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t inc, k, i, last, mask; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); i = k & mask; \ + inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \ + while (!__ac_isempty(h->flags, i) 
&& (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + i = (i + inc) & mask; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? h->n_buckets : i; \ + } else return 0; \ + } \ + SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. Returns 0 on success (including when the requested size is too small and nothing is done) and -1 when an allocation fails. */ \ + khint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) new_n_buckets = 4; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (!new_flags) return -1; \ + memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (!new_keys) { kfree(new_flags); return -1; } /* new_flags is not yet owned by h: release it before bailing out */ \ + h->keys = new_keys; \ + if (kh_is_map) { \ + khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + if (!new_vals) { kfree(new_flags); return -1; } /* h->keys keeps its enlarged buffer; only the scratch flags are dropped */ \ + h->vals = new_vals; \ + } \ + } /* otherwise shrink */ \ + } \ + } \ + if (j) { /* rehashing is needed */ \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + khint_t new_mask; \ + new_mask = new_n_buckets - 1; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khint_t inc, k, i; \ + k = __hash_func(key); \ + i = k & new_mask; \ + inc = __ac_inc(k, new_mask); \ + while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the
existing element */ \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ + __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + kfree(h->flags); /* free the working space */ \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + return 0; \ + } \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size<<1)) { \ + if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ + *ret = -1; return h->n_buckets; \ + } \ + } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ + *ret = -1; return h->n_buckets; \ + } \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ + { \ + khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ + x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ + if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ + else { \ + inc = __ac_inc(k, mask); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + i = (i + inc) & mask; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = 
site; \ + else x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { /* not present at all */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ + return x; \ + } \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_PROTOTYPES(name, khkey_t, khval_t) + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! @function + @abstract Integer hash function + @param key The integer [khint32_t] + @return The hash value [khint_t] + */ +#define kh_int_hash_func(key) (khint32_t)(key) +/*! @function + @abstract Integer comparison function + */ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit integer hash function + @param key The integer [khint64_t] + @return The hash value [khint_t] + */ +#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function + */ +#define kh_int64_hash_equal(a, b) ((a) == (b)) +/*! 
@function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static kh_inline khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = (khint_t)*s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] + */ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function + */ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) +/* Integer hash that mixes a khint_t with shifts, adds and xors; reached through kh_int_hash_func2(key), whose parameter name must match the name used in its expansion. */ +static kh_inline khint_t __ac_Wang_hash(khint_t key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} +#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)(key)) + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other convenient macros... */ + +/*! + @abstract Type of the hash table. + @param name Name of the hash table [symbol] + */ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] + */ +#define kh_init(name) kh_init_##name() + +/*! @function + @abstract Destroy a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! @function + @abstract Reset a hash table without deallocating memory. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s New size [khint_t] + */ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*!
@function + @abstract Insert a key to the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] + */ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] + */ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] + */ +#define kh_del(name, h, k) kh_del_##name(h, k) + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*! @function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x]) + +/*! @function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->vals[x]) + +/*! @function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) ((h)->vals[x]) + +/*!
@function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator [khint_t] + */ +#define kh_end(h) ((h)->n_buckets) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] + */ +#define kh_n_buckets(h) ((h)->n_buckets) + +/*! @function + @abstract Iterate over the entries in the hash table; buckets for which kh_exist() is false are skipped + @param h Pointer to the hash table [khash_t(name)*] + @param kvar Variable to which key will be assigned + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (kvar) = kh_key(h,__i); \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/*! @function + @abstract Iterate over the values in the hash table; buckets for which kh_exist() is false are skipped + @param h Pointer to the hash table [khash_t(name)*] + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach_value(h, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/* More convenient interfaces */ + +/*!
@function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash set containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash set containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*!
@function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#endif /* __AC_KHASH_H */ diff --git a/src/bwa/kopen.c b/src/bwa/kopen.c new file mode 100644 index 000000000..d238226c1 --- /dev/null +++ b/src/bwa/kopen.c @@ -0,0 +1,374 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef _WIN32 +#include +#include +#include +#endif + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +#ifdef _WIN32 +#define _KO_NO_NET +#endif + +#ifndef _KO_NO_NET +static int socket_wait(int fd, int is_read) +{ + fd_set fds, *fdr = 0, *fdw = 0; + struct timeval tv; + int ret; + tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out + FD_ZERO(&fds); + FD_SET(fd, &fds); + if (is_read) fdr = &fds; + else fdw = &fds; + ret = select(fd+1, fdr, fdw, 0, &tv); + if (ret == -1) perror("select"); + return ret; +} + +static int socket_connect(const char *host, const char *port) +{ +#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0) + + int on = 1, fd; + struct linger lng = { 0, 0 }; + struct addrinfo hints, *res = 0; + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo"); + if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket"); + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt"); + if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt"); + if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect"); + freeaddrinfo(res); + return fd; +#undef __err_connect +} + +static int 
write_bytes(int fd, const char *buf, size_t len) +{ + ssize_t bytes; + do { + bytes = write(fd, buf, len); + if (bytes >= 0) { + len -= bytes; + } else if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) { + return -1; + } + } while (len > 0); + + return 0; +} + +static int http_open(const char *fn) +{ + char *p, *proxy, *q, *http_host, *host, *port, *path, *buf; + int fd, ret, l; + ssize_t bytes = 0, bufsz = 0x10000; + + /* parse URL; adapted from khttp_parse_url() in knetfile.c */ + if (strstr(fn, "http://") != fn) return 0; + // set ->http_host + for (p = (char*)fn + 7; *p && *p != '/'; ++p); + l = p - fn - 7; + http_host = calloc(l + 1, 1); + strncpy(http_host, fn + 7, l); + http_host[l] = 0; + for (q = http_host; *q && *q != ':'; ++q); + if (*q == ':') *q++ = 0; + // get http_proxy + proxy = getenv("http_proxy"); + // set host, port and path + if (proxy == 0) { + host = strdup(http_host); // when there is no proxy, server name is identical to http_host name. + port = strdup(*q? q : "80"); + path = strdup(*p? p : "/"); + } else { + host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy); + for (q = host; *q && *q != ':'; ++q); + if (*q == ':') *q++ = 0; + port = strdup(*q? q : "80"); + path = strdup(fn); + } + + /* connect; adapted from khttp_connect() in knetfile.c */ + l = 0; + fd = socket_connect(host, port); + buf = calloc(bufsz, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. 
+ l += snprintf(buf + l, bufsz, "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n", + path, http_host); + if (write_bytes(fd, buf, l) != 0) { + close(fd); + fd = -1; + goto out; + } + l = 0; + retry: + while (l < bufsz && (bytes = read(fd, buf + l, 1)) > 0) { // read HTTP header; FIXME: bad efficiency + if (buf[l] == '\n' && l >= 3) + if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break; + ++l; + } + if (bytes < 0 && (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)) goto retry; + + buf[l] = 0; + if (bytes < 0 || l < 14) { // prematured header + close(fd); + fd = -1; + goto out; + } + ret = strtol(buf + 8, &p, 0); // HTTP return code + if (ret != 200) { + close(fd); + fd = -1; + } + out: + free(buf); free(http_host); free(host); free(port); free(path); + return fd; +} + +typedef struct { + int max_response, ctrl_fd; + char *response; +} ftpaux_t; + +static int kftp_get_response(ftpaux_t *aux) +{ + unsigned char c; + int n = 0; + char *p; + if (socket_wait(aux->ctrl_fd, 1) <= 0) return 0; + while (read(aux->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O + if (n >= aux->max_response) { + aux->max_response = aux->max_response? aux->max_response<<1 : 256; + aux->response = realloc(aux->response, aux->max_response); + } + aux->response[n++] = c; + if (c == '\n') { + if (n >= 4 && isdigit(aux->response[0]) && isdigit(aux->response[1]) && isdigit(aux->response[2]) + && aux->response[3] != '-') break; + n = 0; + continue; + } + } + if (n < 2) return -1; + aux->response[n-2] = 0; + return strtol(aux->response, &p, 0); +} + +static int kftp_send_cmd(ftpaux_t *aux, const char *cmd, int is_get) +{ + if (socket_wait(aux->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing + if (write_bytes(aux->ctrl_fd, cmd, strlen(cmd)) != 0) return -1; + return is_get? 
kftp_get_response(aux) : 0; +} + +static int ftp_open(const char *fn) +{ + char *p, *host = 0, *port = 0, *retr = 0; + char host2[80], port2[10]; + int v[6], l, fd = -1, ret, pasv_port, pasv_ip[4]; + ftpaux_t aux; + + /* parse URL */ + if (strstr(fn, "ftp://") != fn) return 0; + for (p = (char*)fn + 6; *p && *p != '/'; ++p); + if (*p != '/') return 0; + l = p - fn - 6; + port = strdup("21"); + host = calloc(l + 1, 1); + strncpy(host, fn + 6, l); + retr = calloc(strlen(p) + 8, 1); + sprintf(retr, "RETR %s\r\n", p); + + /* connect to ctrl */ + memset(&aux, 0, sizeof(ftpaux_t)); + aux.ctrl_fd = socket_connect(host, port); + if (aux.ctrl_fd == -1) goto ftp_open_end; /* fail to connect ctrl */ + + /* connect to the data stream */ + kftp_get_response(&aux); + kftp_send_cmd(&aux, "USER anonymous\r\n", 1); + kftp_send_cmd(&aux, "PASS kopen@\r\n", 1); + kftp_send_cmd(&aux, "TYPE I\r\n", 1); + kftp_send_cmd(&aux, "PASV\r\n", 1); + for (p = aux.response; *p && *p != '('; ++p); + if (*p != '(') goto ftp_open_end; + ++p; + sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); + memcpy(pasv_ip, v, 4 * sizeof(int)); + pasv_port = (v[4]<<8&0xff00) + v[5]; + kftp_send_cmd(&aux, retr, 0); + sprintf(host2, "%d.%d.%d.%d", pasv_ip[0], pasv_ip[1], pasv_ip[2], pasv_ip[3]); + sprintf(port2, "%d", pasv_port); + fd = socket_connect(host2, port2); + if (fd == -1) goto ftp_open_end; + ret = kftp_get_response(&aux); + if (ret != 150) { + close(fd); + fd = -1; + } + close(aux.ctrl_fd); + +ftp_open_end: + free(host); free(port); free(retr); free(aux.response); + return fd; +} +#endif /* !defined(_KO_NO_NET) */ + +static char **cmd2argv(const char *cmd) +{ + int i, beg, end, argc; + char **argv, *str; + end = strlen(cmd); + for (i = end - 1; i >= 0; --i) + if (!isspace(cmd[i])) break; + end = i + 1; + for (beg = 0; beg < end; ++beg) + if (!isspace(cmd[beg])) break; + if (beg == end) return 0; + for (i = beg + 1, argc = 0; i < end; ++i) + if (isspace(cmd[i]) && 
!isspace(cmd[i-1])) + ++argc; + argv = (char**)calloc(argc + 2, sizeof(void*)); + argv[0] = str = (char*)calloc(end - beg + 1, 1); + strncpy(argv[0], cmd + beg, end - beg); + for (i = argc = 1; i < end - beg; ++i) + if (isspace(str[i])) str[i] = 0; + else if (str[i] && str[i-1] == 0) argv[argc++] = &str[i]; + return argv; +} + +#define KO_STDIN 1 +#define KO_FILE 2 +#define KO_PIPE 3 +#define KO_HTTP 4 +#define KO_FTP 5 + +typedef struct { + int type, fd; + pid_t pid; +} koaux_t; + +void *kopen(const char *fn, int *_fd) +{ + koaux_t *aux = 0; + *_fd = -1; + if (strstr(fn, "http://") == fn) { + aux = calloc(1, sizeof(koaux_t)); + aux->type = KO_HTTP; + aux->fd = http_open(fn); + } else if (strstr(fn, "ftp://") == fn) { + aux = calloc(1, sizeof(koaux_t)); + aux->type = KO_FTP; + aux->fd = ftp_open(fn); + } else if (strcmp(fn, "-") == 0) { + aux = calloc(1, sizeof(koaux_t)); + aux->type = KO_STDIN; + aux->fd = STDIN_FILENO; + } else { + const char *p, *q; + for (p = fn; *p; ++p) + if (!isspace(*p)) break; + if (*p == '<') { // pipe open + int need_shell, pfd[2]; + pid_t pid; + // a simple check to see if we need to invoke a shell; not always working + for (q = p + 1; *q; ++q) + if (ispunct(*q) && *q != '.' 
&& *q != '_' && *q != '-' && *q != ':') + break; + need_shell = (*q != 0); + if (pipe(pfd) != 0) return 0; + pid = vfork(); + if (pid == -1) { /* vfork() error */ + close(pfd[0]); close(pfd[1]); + return 0; + } + if (pid == 0) { /* the child process */ + char **argv; /* FIXME: I do not know if this will lead to a memory leak */ + close(pfd[0]); + dup2(pfd[1], STDOUT_FILENO); + close(pfd[1]); + if (!need_shell) { + argv = cmd2argv(p + 1); + execvp(argv[0], argv); + free(argv[0]); free(argv); + } else execl("/bin/sh", "sh", "-c", p + 1, NULL); + exit(1); + } else { /* parent process */ + close(pfd[1]); + aux = calloc(1, sizeof(koaux_t)); + aux->type = KO_PIPE; + aux->fd = pfd[0]; + aux->pid = pid; + } + } else { +#ifdef _WIN32 + *_fd = open(fn, O_RDONLY | O_BINARY); +#else + *_fd = open(fn, O_RDONLY); +#endif + if (*_fd >= 0) { + aux = calloc(1, sizeof(koaux_t)); + aux->type = KO_FILE; + aux->fd = *_fd; + } + } + } + if (aux) *_fd = aux->fd; + return aux; +} + +int kclose(void *a) +{ + koaux_t *aux = (koaux_t*)a; + if (aux->type == KO_PIPE) { + int status; + pid_t pid; + pid = waitpid(aux->pid, &status, WNOHANG); + if (pid != aux->pid) kill(aux->pid, 15); + } + free(aux); + return 0; +} + +#ifdef _KO_MAIN +#define BUF_SIZE 0x10000 +int main(int argc, char *argv[]) +{ + void *x; + int l, fd; + unsigned char buf[BUF_SIZE]; + FILE *fp; + if (argc == 1) { + fprintf(stderr, "Usage: kopen \n"); + return 1; + } + x = kopen(argv[1], &fd); + fp = fdopen(fd, "r"); + if (fp == 0) { + fprintf(stderr, "ERROR: fail to open the input\n"); + return 1; + } + do { + if ((l = fread(buf, 1, BUF_SIZE, fp)) != 0) + fwrite(buf, 1, l, stdout); + } while (l == BUF_SIZE); + fclose(fp); + kclose(x); + return 0; +} +#endif diff --git a/src/bwa/kseq.h b/src/bwa/kseq.h new file mode 100644 index 000000000..f3862c6ff --- /dev/null +++ b/src/bwa/kseq.h @@ -0,0 +1,239 @@ +/* The MIT License + + Copyright (c) 2008, 2009, 2011 Attractive Chaos + + Permission is hereby granted, free of charge, to any 
person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Last Modified: 05MAR2012 */ + +#ifndef AC_KSEQ_H +#define AC_KSEQ_H + +#include +#include +#include + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r +#define KS_SEP_TAB 1 // isspace() && !' 
' +#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) +#define KS_SEP_MAX 2 + +#define __KS_TYPE(type_t) \ + typedef struct __kstream_t { \ + unsigned char *buf; \ + int begin, end, is_eof; \ + type_t f; \ + } kstream_t; + +#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) +#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) + +#define __KS_BASIC(type_t, __bufsize) \ + static inline kstream_t *ks_init(type_t f) \ + { \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + ks->f = f; \ + ks->buf = (unsigned char*)malloc(__bufsize); \ + return ks; \ + } \ + static inline void ks_destroy(kstream_t *ks) \ + { \ + if (ks) { \ + free(ks->buf); \ + free(ks); \ + } \ + } + +#define __KS_GETC(__read, __bufsize) \ + static inline int ks_getc(kstream_t *ks) \ + { \ + if (ks->is_eof && ks->begin >= ks->end) return -1; \ + if (ks->begin >= ks->end) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end == 0) { ks->is_eof = 1; return -1;} \ + } \ + return (int)ks->buf[ks->begin++]; \ + } + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#define __KS_GETUNTIL(__read, __bufsize) \ + static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ + { \ + int gotany = 0; \ + if (dret) *dret = 0; \ + str->l = append? 
str->l : 0; \ + for (;;) { \ + int i; \ + if (ks->begin >= ks->end) { \ + if (!ks->is_eof) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end == 0) { ks->is_eof = 1; break; } \ + } else break; \ + } \ + if (delimiter == KS_SEP_LINE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == '\n') break; \ + } else if (delimiter > KS_SEP_MAX) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == delimiter) break; \ + } else if (delimiter == KS_SEP_SPACE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i])) break; \ + } else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + } else i = 0; /* never come to here! */ \ + if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ + str->m = str->l + (i - ks->begin) + 1; \ + kroundup32(str->m); \ + str->s = (char*)realloc(str->s, str->m); \ + } \ + gotany = 1; \ + memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ + str->l = str->l + (i - ks->begin); \ + ks->begin = i + 1; \ + if (i < ks->end) { \ + if (dret) *dret = ks->buf[i]; \ + break; \ + } \ + } \ + if (!gotany && ks_eof(ks)) return -1; \ + if (str->s == 0) { \ + str->m = 1; \ + str->s = (char*)calloc(1, 1); \ + } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ + str->s[str->l] = '\0'; \ + return str->l; \ + } \ + static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { return ks_getuntil2(ks, delimiter, str, dret, 0); } + +#define KSTREAM_INIT(type_t, __read, __bufsize) \ + __KS_TYPE(type_t) \ + __KS_BASIC(type_t, __bufsize) \ + __KS_GETC(__read, __bufsize) \ + __KS_GETUNTIL(__read, __bufsize) + +#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) + +#define __KSEQ_BASIC(SCOPE, type_t) \ + SCOPE kseq_t *kseq_init(type_t fd) \ + { \ + kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ + 
s->f = ks_init(fd); \ + return s; \ + } \ + SCOPE void kseq_destroy(kseq_t *ks) \ + { \ + if (!ks) return; \ + free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ + ks_destroy(ks->f); \ + free(ks); \ + } + +/* Return value: + >=0 length of the sequence (normal) + -1 end-of-file + -2 truncated quality string + */ +#define __KSEQ_READ(SCOPE) \ + SCOPE int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ + if (seq->last_char == 0) { /* then jump to the next header line */ \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ + seq->last_char = c; \ + } /* else: the first header char has been read in the previous call */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ + if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ + if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ + seq->seq.m = 256; \ + seq->seq.s = (char*)malloc(seq->seq.m); \ + } \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + if (c == '\n') continue; /* skip empty lines */ \ + seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ + ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ + } \ + if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ + if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ + 
seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ + while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ + if (c == -1) return -2; /* error: no quality string */ \ + while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ + seq->last_char = 0; /* we have not come to the next header line */ \ + if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ + return seq->seq.l; \ + } + +#define __KSEQ_TYPE(type_t) \ + typedef struct { \ + kstring_t name, comment, seq, qual; \ + int last_char; \ + kstream_t *f; \ + } kseq_t; + +#define KSEQ_INIT2(SCOPE, type_t, __read) \ + KSTREAM_INIT(type_t, __read, 16384) \ + __KSEQ_TYPE(type_t) \ + __KSEQ_BASIC(SCOPE, type_t) \ + __KSEQ_READ(SCOPE) + +#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) + +#define KSEQ_DECLARE(type_t) \ + __KS_TYPE(type_t) \ + __KSEQ_TYPE(type_t) \ + extern kseq_t *kseq_init(type_t fd); \ + void kseq_destroy(kseq_t *ks); \ + int kseq_read(kseq_t *seq); + +#endif diff --git a/src/bwa/ksort.h b/src/bwa/ksort.h new file mode 100644 index 000000000..5851b0d97 --- /dev/null +++ b/src/bwa/ksort.h @@ -0,0 +1,273 @@ +/* The MIT License + + Copyright (c) 2008, by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + 2008-11-16 (0.1.4): + + * Fixed a bug in introsort() that happens in rare cases. + + 2008-11-05 (0.1.3): + + * Fixed a bug in introsort() for complex comparisons. + + * Fixed a bug in mergesort(). The previous version is not stable. + + 2008-09-15 (0.1.2): + + * Accelerated introsort. On my Mac (not on another Linux machine), + my implementation is as fast as std::sort on random input. + + * Added combsort and in introsort, switch to combsort if the + recursion is too deep. + + 2008-09-13 (0.1.1): + + * Added k-small algorithm + + 2008-09-05 (0.1.0): + + * Initial version + +*/ + +#ifndef AC_KSORT_H +#define AC_KSORT_H + +#include +#include + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +typedef struct { + void *left, *right; + int depth; +} ks_isort_stack_t; + +#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } + +#define KSORT_INIT(name, type_t, __sort_lt) \ + void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \ + { \ + type_t *a2[2], *a, *b; \ + int curr, shift; \ + \ + a2[0] = array; \ + a2[1] = temp? 
temp : (type_t*)malloc(sizeof(type_t) * n); \ + for (curr = 0, shift = 0; (1ul<> 1) - 1; i != (size_t)(-1); --i) \ + ks_heapadjust_##name(i, lsize, l); \ + } \ + void ks_heapsort_##name(size_t lsize, type_t l[]) \ + { \ + size_t i; \ + for (i = lsize - 1; i > 0; --i) { \ + type_t tmp; \ + tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ + } \ + } \ + static inline void __ks_insertsort_##name(type_t *s, type_t *t) \ + { \ + type_t *i, *j, swap_tmp; \ + for (i = s + 1; i < t; ++i) \ + for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ + swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ + } \ + } \ + void ks_combsort_##name(size_t n, type_t a[]) \ + { \ + const double shrink_factor = 1.2473309501039786540366528676643; \ + int do_swap; \ + size_t gap = n; \ + type_t tmp, *i, *j; \ + do { \ + if (gap > 2) { \ + gap = (size_t)(gap / shrink_factor); \ + if (gap == 9 || gap == 10) gap = 11; \ + } \ + do_swap = 0; \ + for (i = a; i < a + n - gap; ++i) { \ + j = i + gap; \ + if (__sort_lt(*j, *i)) { \ + tmp = *i; *i = *j; *j = tmp; \ + do_swap = 1; \ + } \ + } \ + } while (do_swap || gap > 2); \ + if (gap != 1) __ks_insertsort_##name(a, a + n); \ + } \ + void ks_introsort_##name(size_t n, type_t a[]) \ + { \ + int d; \ + ks_isort_stack_t *top, *stack; \ + type_t rp, swap_tmp; \ + type_t *s, *t, *i, *j, *k; \ + \ + if (n < 1) return; \ + else if (n == 2) { \ + if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ + return; \ + } \ + for (d = 2; 1ul<>1) + 1; \ + if (__sort_lt(*k, *i)) { \ + if (__sort_lt(*k, *j)) k = j; \ + } else k = __sort_lt(*j, *i)? 
i : j; \ + rp = *k; \ + if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ + for (;;) { \ + do ++i; while (__sort_lt(*i, rp)); \ + do --j; while (i <= j && __sort_lt(rp, *j)); \ + if (j <= i) break; \ + swap_tmp = *i; *i = *j; *j = swap_tmp; \ + } \ + swap_tmp = *i; *i = *t; *t = swap_tmp; \ + if (i-s > t-i) { \ + if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ + s = t-i > 16? i+1 : t; \ + } else { \ + if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ + t = i-s > 16? i-1 : s; \ + } \ + } else { \ + if (top == stack) { \ + free(stack); \ + __ks_insertsort_##name(a, a+n); \ + return; \ + } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ + } \ + } \ + } \ + /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ + /* 0 <= kk < n */ \ + type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ + { \ + type_t *low, *high, *k, *ll, *hh, *mid; \ + low = arr; high = arr + n - 1; k = arr + kk; \ + for (;;) { \ + if (high <= low) return *k; \ + if (high == low + 1) { \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + return *k; \ + } \ + mid = low + (high - low) / 2; \ + if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ + KSORT_SWAP(type_t, *mid, *(low+1)); \ + ll = low + 1; hh = high; \ + for (;;) { \ + do ++ll; while (__sort_lt(*ll, *low)); \ + do --hh; while (__sort_lt(*low, *hh)); \ + if (hh < ll) break; \ + KSORT_SWAP(type_t, *ll, *hh); \ + } \ + KSORT_SWAP(type_t, *low, *hh); \ + if (hh <= k) low = ll; \ + if (hh >= k) high = hh - 1; \ + } \ + } + +#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) +#define ks_introsort(name, n, a) ks_introsort_##name(n, a) +#define ks_combsort(name, n, a) ks_combsort_##name(n, a) +#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a) +#define 
ks_heapmake(name, n, a) ks_heapmake_##name(n, a) +#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) +#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) + +#define ks_lt_generic(a, b) ((a) < (b)) +#define ks_lt_str(a, b) (strcmp((a), (b)) < 0) + +typedef const char *ksstr_t; + +#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) +#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) + +#endif diff --git a/src/bwa/kstring.c b/src/bwa/kstring.c new file mode 100644 index 000000000..2871310e8 --- /dev/null +++ b/src/bwa/kstring.c @@ -0,0 +1,39 @@ +#include +#include +#include "kstring.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +int ksprintf(kstring_t *s, const char *fmt, ...) +{ + va_list ap; + int l; + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); + va_end(ap); + if (l + 1 > s->m - s->l) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); + } + va_end(ap); + s->l += l; + return l; +} + +#ifdef KSTRING_MAIN +#include +int main() +{ + kstring_t *s; + s = (kstring_t*)calloc(1, sizeof(kstring_t)); + ksprintf(s, "abcdefg: %d", 100); + printf("%s\n", s->s); + free(s); + return 0; +} +#endif diff --git a/src/bwa/kstring.h b/src/bwa/kstring.h new file mode 100644 index 000000000..fe7fa95a7 --- /dev/null +++ b/src/bwa/kstring.h @@ -0,0 +1,115 @@ +#ifndef KSTRING_H +#define KSTRING_H + +#include +#include + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +static inline void ks_resize(kstring_t *s, size_t size) +{ + if (s->m < size) { + s->m = size; + kroundup32(s->m); + s->s = (char*)realloc(s->s, 
s->m); + } +} + +static inline int kputsn(const char *p, int l, kstring_t *s) +{ + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + memcpy(s->s + s->l, p, l); + s->l += l; + s->s[s->l] = 0; + return l; +} + +static inline int kputs(const char *p, kstring_t *s) +{ + return kputsn(p, strlen(p), s); +} + +static inline int kputc(int c, kstring_t *s) +{ + if (s->l + 1 >= s->m) { + s->m = s->l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + s->s[s->l++] = c; + s->s[s->l] = 0; + return c; +} + +static inline int kputw(int c, kstring_t *s) +{ + char buf[16]; + int l, x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (c < 0) buf[l++] = '-'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; + s->s[s->l] = 0; + return 0; +} + +static inline int kputuw(unsigned c, kstring_t *s) +{ + char buf[16]; + int l, i; + unsigned x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; + s->s[s->l] = 0; + return 0; +} + +static inline int kputl(long c, kstring_t *s) +{ + char buf[32]; + long l, x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c < 0? 
-c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (c < 0) buf[l++] = '-'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; + s->s[s->l] = 0; + return 0; +} + +int ksprintf(kstring_t *s, const char *fmt, ...); + +#endif diff --git a/src/bwa/ksw.c b/src/bwa/ksw.c new file mode 100644 index 000000000..9793e5eb4 --- /dev/null +++ b/src/bwa/ksw.c @@ -0,0 +1,713 @@ +/* The MIT License + + Copyright (c) 2011 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +#include +#include +#include +#include +#include "ksw.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect((x),1) +#define UNLIKELY(x) __builtin_expect((x),0) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif + +const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 }; + +struct _kswq_t { + int qlen, slen; + uint8_t shift, mdiff, max, size; + __m128i *qp, *H0, *H1, *E, *Hmax; +}; + +/** + * Initialize the query data structure + * + * @param size Number of bytes used to store a score; valid values are 1 or 2 + * @param qlen Length of the query sequence + * @param query Query sequence + * @param m Size of the alphabet + * @param mat Scoring matrix in a one-dimension array + * + * @return Query data structure + */ +kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat) +{ + kswq_t *q; + int slen, a, tmp, p; + + size = size > 1? 2 : 1; + p = 8 * (3 - size); // # values per __m128i + slen = (qlen + p - 1) / p; // segmented length + q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory + q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory + q->H0 = q->qp + slen * m; + q->H1 = q->H0 + slen; + q->E = q->H1 + slen; + q->Hmax = q->E + slen; + q->slen = slen; q->qlen = qlen; q->size = size; + // compute shift + tmp = m * m; + for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score + if (mat[a] < (int8_t)q->shift) q->shift = mat[a]; + if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a]; + } + q->max = q->mdiff; + q->shift = 256 - q->shift; // NB: q->shift is uint8_t + q->mdiff += q->shift; // this is the difference between the min and max scores + // An example: p=8, qlen=19, slen=3 and segmentation: + // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}} + if (size == 1) { + int8_t *t = (int8_t*)q->qp; + for (a = 0; a < m; ++a) { + int i, 
k, nlen = slen * p; + const int8_t *ma = mat + a * m; + for (i = 0; i < slen; ++i) + for (k = i; k < nlen; k += slen) // p iterations + *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift; + } + } else { + int16_t *t = (int16_t*)q->qp; + for (a = 0; a < m; ++a) { + int i, k, nlen = slen * p; + const int8_t *ma = mat + a * m; + for (i = 0; i < slen; ++i) + for (k = i; k < nlen; k += slen) // p iterations + *t++ = (k >= qlen? 0 : ma[query[k]]); + } + } + return q; +} + +kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e) +{ + int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; + uint64_t *b; + __m128i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax; + kswr_t r; + +#define __max_16(ret, xx) do { \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \ + (ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \ + } while (0) + + // initialization + r = g_defr; + minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; + endsc = (xtra&KSW_XSTOP)? 
xtra&0xffff : 0x10000; + m_b = n_b = 0; b = 0; + zero = _mm_set1_epi32(0); + oe_del = _mm_set1_epi8(_o_del + _e_del); + e_del = _mm_set1_epi8(_e_del); + oe_ins = _mm_set1_epi8(_o_ins + _e_ins); + e_ins = _mm_set1_epi8(_e_ins); + shift = _mm_set1_epi8(q->shift); + H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; + slen = q->slen; + for (i = 0; i < slen; ++i) { + _mm_store_si128(E + i, zero); + _mm_store_si128(H0 + i, zero); + _mm_store_si128(Hmax + i, zero); + } + // the core loop + for (i = 0; i < tlen; ++i) { + int j, k, cmp, imax; + __m128i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector + h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example + h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian + for (j = 0; LIKELY(j < slen); ++j) { + /* SW cells are computed in the following order: + * H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} + * E(i+1,j) = max{H(i,j)-q, E(i,j)-r} + * F(i,j+1) = max{H(i,j)-q, F(i,j)-r} + */ + // compute H'(i,j); note that at the beginning, h=H'(i-1,j-1) + h = _mm_adds_epu8(h, _mm_load_si128(S + j)); + h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j) + e = _mm_load_si128(E + j); // e=E'(i,j) + h = _mm_max_epu8(h, e); + h = _mm_max_epu8(h, f); // h=H'(i,j) + max = _mm_max_epu8(max, h); // set max + _mm_store_si128(H1 + j, h); // save to H'(i,j) + // now compute E'(i+1,j) + e = _mm_subs_epu8(e, e_del); // e=E'(i,j) - e_del + t = _mm_subs_epu8(h, oe_del); // h=H'(i,j) - o_del - e_del + e = _mm_max_epu8(e, t); // e=E'(i+1,j) + _mm_store_si128(E + j, e); // save to E'(i+1,j) + // now compute F'(i,j+1) + f = _mm_subs_epu8(f, e_ins); + t = _mm_subs_epu8(h, oe_ins); // h=H'(i,j) - o_ins - e_ins + f = _mm_max_epu8(f, t); + // get H'(i-1,j) and prepare for the next j + h = _mm_load_si128(H0 + j); // h=H'(i-1,j) + } + // NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion + for (k = 0; LIKELY(k < 16); 
++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max + f = _mm_slli_si128(f, 1); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_load_si128(H1 + j); + h = _mm_max_epu8(h, f); // h=H'(i,j) + _mm_store_si128(H1 + j, h); + h = _mm_subs_epu8(h, oe_ins); + f = _mm_subs_epu8(f, e_ins); + cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero)); + if (UNLIKELY(cmp == 0xffff)) goto end_loop16; + } + } +end_loop16: + //int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n"); + __max_16(imax, max); // imax is the maximum number in max + if (imax >= minsc) { // write the b array; this condition adds branching unfornately + if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append + if (n_b == m_b) { + m_b = m_b? m_b<<1 : 8; + b = (uint64_t*)realloc(b, 8 * m_b); + } + b[n_b++] = (uint64_t)imax<<32 | i; + } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last + } + if (imax > gmax) { + gmax = imax; te = i; // te is the end position on the target + for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector + _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); + if (gmax + q->shift >= 255 || gmax >= endsc) break; + } + S = H1; H1 = H0; H0 = S; // swap H0 and H1 + } + r.score = gmax + q->shift < 255? 
gmax : 255; + r.te = te; + if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score + int max = -1, tmp, low, high, qlen = slen * 16; + uint8_t *t = (uint8_t*)Hmax; + for (i = 0; i < qlen; ++i, ++t) + if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen; + else if ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe) r.qe = tmp; + //printf("%d,%d\n", max, gmax); + if (b) { + i = (r.score + q->max - 1) / q->max; + low = te - i; high = te + i; + for (i = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && (int)(b[i]>>32) > r.score2) + r.score2 = b[i]>>32, r.te2 = e; + } + } + } + free(b); + return r; +} + +kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e) +{ + int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; + uint64_t *b; + __m128i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax; + kswr_t r; + +#define __max_8(ret, xx) do { \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \ + (ret) = _mm_extract_epi16((xx), 0); \ + } while (0) + + // initialization + r = g_defr; + minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; + endsc = (xtra&KSW_XSTOP)? 
xtra&0xffff : 0x10000; + m_b = n_b = 0; b = 0; + zero = _mm_set1_epi32(0); + oe_del = _mm_set1_epi16(_o_del + _e_del); + e_del = _mm_set1_epi16(_e_del); + oe_ins = _mm_set1_epi16(_o_ins + _e_ins); + e_ins = _mm_set1_epi16(_e_ins); + H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; + slen = q->slen; + for (i = 0; i < slen; ++i) { + _mm_store_si128(E + i, zero); + _mm_store_si128(H0 + i, zero); + _mm_store_si128(Hmax + i, zero); + } + // the core loop + for (i = 0; i < tlen; ++i) { + int j, k, imax; + __m128i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector + h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example + h = _mm_slli_si128(h, 2); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_adds_epi16(h, *S++); + e = _mm_load_si128(E + j); + h = _mm_max_epi16(h, e); + h = _mm_max_epi16(h, f); + max = _mm_max_epi16(max, h); + _mm_store_si128(H1 + j, h); + e = _mm_subs_epu16(e, e_del); + t = _mm_subs_epu16(h, oe_del); + e = _mm_max_epi16(e, t); + _mm_store_si128(E + j, e); + f = _mm_subs_epu16(f, e_ins); + t = _mm_subs_epu16(h, oe_ins); + f = _mm_max_epi16(f, t); + h = _mm_load_si128(H0 + j); + } + for (k = 0; LIKELY(k < 16); ++k) { + f = _mm_slli_si128(f, 2); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_load_si128(H1 + j); + h = _mm_max_epi16(h, f); + _mm_store_si128(H1 + j, h); + h = _mm_subs_epu16(h, oe_ins); + f = _mm_subs_epu16(f, e_ins); + if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8; + } + } +end_loop8: + __max_8(imax, max); + if (imax >= minsc) { + if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { + if (n_b == m_b) { + m_b = m_b? 
m_b<<1 : 8; + b = (uint64_t*)realloc(b, 8 * m_b); + } + b[n_b++] = (uint64_t)imax<<32 | i; + } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last + } + if (imax > gmax) { + gmax = imax; te = i; + for (j = 0; LIKELY(j < slen); ++j) + _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); + if (gmax >= endsc) break; + } + S = H1; H1 = H0; H0 = S; + } + r.score = gmax; r.te = te; + { + int max = -1, tmp, low, high, qlen = slen * 8; + uint16_t *t = (uint16_t*)Hmax; + for (i = 0, r.qe = -1; i < qlen; ++i, ++t) + if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen; + else if ((int)*t == max && (tmp = i / 8 + i % 8 * slen) < r.qe) r.qe = tmp; + if (b) { + i = (r.score + q->max - 1) / q->max; + low = te - i; high = te + i; + for (i = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && (int)(b[i]>>32) > r.score2) + r.score2 = b[i]>>32, r.te2 = e; + } + } + } + free(b); + return r; +} + +static inline void revseq(int l, uint8_t *s) +{ + int i, t; + for (i = 0; i < l>>1; ++i) + t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t; +} + +kswr_t ksw_align2(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int xtra, kswq_t **qry) +{ + int size; + kswq_t *q; + kswr_t r, rr; + kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int, int, int); + + q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat); + if (qry && *qry == 0) *qry = q; + func = q->size == 2? 
ksw_i16 : ksw_u8; + size = q->size; + r = func(q, tlen, target, o_del, e_del, o_ins, e_ins, xtra); + if (qry == 0) free(q); + if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r; + revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end + q = ksw_qinit(size, r.qe + 1, query, m, mat); + rr = func(q, tlen, target, o_del, e_del, o_ins, e_ins, KSW_XSTOP | r.score); + revseq(r.qe + 1, query); revseq(r.te + 1, target); + free(q); + if (r.score == rr.score) + r.tb = r.te - rr.te, r.qb = r.qe - rr.qe; + return r; +} + +kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry) +{ + return ksw_align2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, xtra, qry); +} + +/******************** + *** SW extension *** + ********************/ + +typedef struct { + int32_t h, e; +} eh_t; + +int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) +{ + eh_t *eh; // score array + int8_t *qp; // query profile + int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; + assert(h0 > 0); + // allocate memory + qp = malloc(qlen * m); + eh = calloc(qlen + 1, 8); + // generate the query profile + for (k = i = 0; k < m; ++k) { + const int8_t *p = &mat[k * m]; + for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]]; + } + // fill the first row + eh[0].h = h0; eh[1].h = h0 > oe_ins? h0 - oe_ins : 0; + for (j = 2; j <= qlen && eh[j-1].h > e_ins; ++j) + eh[j].h = eh[j-1].h - e_ins; + // adjust $w if it is too large + k = m * m; + for (i = 0, max = 0; i < k; ++i) // get the max score + max = max > mat[i]? 
max : mat[i]; + max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); + max_ins = max_ins > 1? max_ins : 1; + w = w < max_ins? w : max_ins; + max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.); + max_del = max_del > 1? max_del : 1; + w = w < max_del? w : max_del; // TODO: is this necessary? + // DP loop + max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1; + max_off = 0; + beg = 0, end = qlen; + for (i = 0; LIKELY(i < tlen); ++i) { + int t, f = 0, h1, m = 0, mj = -1; + int8_t *q = &qp[target[i] * qlen]; + // apply the band and the constraint (if provided) + if (beg < i - w) beg = i - w; + if (end > i + w + 1) end = i + w + 1; + if (end > qlen) end = qlen; + // compute the first column + if (beg == 0) { + h1 = h0 - (o_del + e_del * (i + 1)); + if (h1 < 0) h1 = 0; + } else h1 = 0; + for (j = beg; LIKELY(j < end); ++j) { + // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) + // Similar to SSE2-SW, cells are computed in the following order: + // H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} + // E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape + // F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape + eh_t *p = &eh[j]; + int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) + p->h = h1; // set H(i,j-1) for the next row + M = M? M + q[j] : 0;// separating H and M to disallow a cigar like "100M3I3D20M" + h = M > e? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0 + h = h > f? h : f; + h1 = h; // save H(i,j) to h1 for the next column + mj = m > h? mj : j; // record the position where max score is achieved + m = m > h? m : h; // m is stored at eh[mj+1] + t = M - oe_del; + t = t > 0? t : 0; + e -= e_del; + e = e > t? e : t; // computed E(i+1,j) + p->e = e; // save E(i+1,j) for the next row + t = M - oe_ins; + t = t > 0? t : 0; + f -= e_ins; + f = f > t? f : t; // computed F(i,j+1) + } + eh[end].h = h1; eh[end].e = 0; + if (j == qlen) { + max_ie = gscore > h1? 
max_ie : i; + gscore = gscore > h1? gscore : h1; + } + if (m == 0) break; + if (m > max) { + max = m, max_i = i, max_j = mj; + max_off = max_off > abs(mj - i)? max_off : abs(mj - i); + } else if (zdrop > 0) { + if (i - max_i > mj - max_j) { + if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) break; + } else { + if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) break; + } + } + // update beg and end for the next round + for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j); + beg = j; + for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j); + end = j + 2 < qlen? j + 2 : qlen; + //beg = 0; end = qlen; // uncomment this line for debugging + } + free(eh); free(qp); + if (_qle) *_qle = max_j + 1; + if (_tle) *_tle = max_i + 1; + if (_gtle) *_gtle = max_ie + 1; + if (_gscore) *_gscore = gscore; + if (_max_off) *_max_off = max_off; + return max; +} + +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off) +{ + return ksw_extend2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, end_bonus, zdrop, h0, qle, tle, gtle, gscore, max_off); +} + +/******************** + * Global alignment * + ********************/ + +#define MINUS_INF -0x40000000 + +static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int op, int len) +{ + if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) { + if (*n_cigar == *m_cigar) { + *m_cigar = *m_cigar? 
(*m_cigar)<<1 : 4; + cigar = realloc(cigar, (*m_cigar) << 2); + } + cigar[(*n_cigar)++] = len<<4 | op; + } else cigar[(*n_cigar)-1] += len<<4; + return cigar; +} + +int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar_, uint32_t **cigar_) +{ + eh_t *eh; + int8_t *qp; // query profile + int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, score, n_col; + uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex + if (n_cigar_) *n_cigar_ = 0; + // allocate memory + n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix + z = n_cigar_ && cigar_? malloc((long)n_col * tlen) : 0; + qp = malloc(qlen * m); + eh = calloc(qlen + 1, 8); + // generate the query profile + for (k = i = 0; k < m; ++k) { + const int8_t *p = &mat[k * m]; + for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]]; + } + // fill the first row + eh[0].h = 0; eh[0].e = MINUS_INF; + for (j = 1; j <= qlen && j <= w; ++j) + eh[j].h = -(o_ins + e_ins * j), eh[j].e = MINUS_INF; + for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band + // DP loop + for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop + int32_t f = MINUS_INF, h1, beg, end, t; + int8_t *q = &qp[target[i] * qlen]; + beg = i > w? i - w : 0; + end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence + h1 = beg == 0? 
-(o_del + e_del * (i + 1)) : MINUS_INF; + if (n_cigar_ && cigar_) { + uint8_t *zi = &z[(long)i * n_col]; + for (j = beg; LIKELY(j < end); ++j) { + // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) + // Cells are computed in the following order: + // M(i,j) = H(i-1,j-1) + S(i,j) + // H(i,j) = max{M(i,j), E(i,j), F(i,j)} + // E(i+1,j) = max{M(i,j)-gapo, E(i,j)} - gape + // F(i,j+1) = max{M(i,j)-gapo, F(i,j)} - gape + // We have to separate M(i,j); otherwise the direction may not be recorded correctly. + // However, a CIGAR like "10M3I3D10M" allowed by local() is disallowed by global(). + // Such a CIGAR may occur, in theory, if mismatch_penalty > 2*gap_ext_penalty + 2*gap_open_penalty/k. + // In practice, this should happen very rarely given a reasonable scoring system. + eh_t *p = &eh[j]; + int32_t h, m = p->h, e = p->e; + uint8_t d; // direction + p->h = h1; + m += q[j]; + d = m >= e? 0 : 1; + h = m >= e? m : e; + d = h >= f? d : 2; + h = h >= f? h : f; + h1 = h; + t = m - oe_del; + e -= e_del; + d |= e > t? 1<<2 : 0; + e = e > t? e : t; + p->e = e; + t = m - oe_ins; + f -= e_ins; + d |= f > t? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two + f = f > t? f : t; + zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell + } + } else { + for (j = beg; LIKELY(j < end); ++j) { + eh_t *p = &eh[j]; + int32_t h, m = p->h, e = p->e; + p->h = h1; + m += q[j]; + h = m >= e? m : e; + h = h >= f? h : f; + h1 = h; + t = m - oe_del; + e -= e_del; + e = e > t? e : t; + p->e = e; + t = m - oe_ins; + f -= e_ins; + f = f > t? f : t; + } + } + eh[end].h = h1; eh[end].e = MINUS_INF; + } + score = eh[qlen].h; + if (n_cigar_ && cigar_) { // backtrack + int n_cigar = 0, m_cigar = 0, which = 0; + uint32_t *cigar = 0, tmp; + i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell + while (i >= 0 && k >= 0) { + which = z[(long)i * n_col + (k - (i > w? 
i - w : 0))] >> (which<<1) & 3; + if (which == 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k; + else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i; + else cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k; + } + if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1); + if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1); + for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR + tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp; + *n_cigar_ = n_cigar, *cigar_ = cigar; + } + free(eh); free(qp); free(z); + return score; +} + +int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_) +{ + return ksw_global2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, n_cigar_, cigar_); +} + +/******************************************* + * Main function (not compiled by default) * + *******************************************/ + +#ifdef _KSW_MAIN + +#include +#include +#include +#include "kseq.h" +KSEQ_INIT(gzFile, err_gzread) + +unsigned char seq_nt4_table[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +int 
main(int argc, char *argv[]) +{ + int c, sa = 1, sb = 3, i, j, k, forward_only = 0, max_rseq = 0; + int8_t mat[25]; + int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART; + uint8_t *rseq = 0; + gzFile fpt, fpq; + kseq_t *kst, *ksq; + + // parse command line + while ((c = getopt(argc, argv, "a:b:q:r:ft:1")) >= 0) { + switch (c) { + case 'a': sa = atoi(optarg); break; + case 'b': sb = atoi(optarg); break; + case 'q': gapo = atoi(optarg); break; + case 'r': gape = atoi(optarg); break; + case 't': minsc = atoi(optarg); break; + case 'f': forward_only = 1; break; + case '1': xtra |= KSW_XBYTE; break; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: ksw [-1] [-f] [-a%d] [-b%d] [-q%d] [-r%d] [-t%d] \n", sa, sb, gapo, gape, minsc); + return 1; + } + if (minsc > 0xffff) minsc = 0xffff; + xtra |= KSW_XSUBO | minsc; + // initialize scoring matrix + for (i = k = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + mat[k++] = i == j? sa : -sb; + mat[k++] = 0; // ambiguous base + } + for (j = 0; j < 5; ++j) mat[k++] = 0; + // open file + fpt = xzopen(argv[optind], "r"); kst = kseq_init(fpt); + fpq = xzopen(argv[optind+1], "r"); ksq = kseq_init(fpq); + // all-pair alignment + while (kseq_read(ksq) > 0) { + kswq_t *q[2] = {0, 0}; + kswr_t r; + for (i = 0; i < (int)ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]]; + if (!forward_only) { // reverse + if ((int)ksq->seq.m > max_rseq) { + max_rseq = ksq->seq.m; + rseq = (uint8_t*)realloc(rseq, max_rseq); + } + for (i = 0, j = ksq->seq.l - 1; i < (int)ksq->seq.l; ++i, --j) + rseq[j] = ksq->seq.s[i] == 4? 
4 : 3 - ksq->seq.s[i]; + } + gzrewind(fpt); kseq_rewind(kst); + while (kseq_read(kst) > 0) { + for (i = 0; i < (int)kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]]; + r = ksw_align(ksq->seq.l, (uint8_t*)ksq->seq.s, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[0]); + if (r.score >= minsc) + err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, r.qb, r.qe+1, r.score, r.score2, r.te2); + if (rseq) { + r = ksw_align(ksq->seq.l, rseq, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[1]); + if (r.score >= minsc) + err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, (int)ksq->seq.l - r.qb, (int)ksq->seq.l - 1 - r.qe, r.score, r.score2, r.te2); + } + } + free(q[0]); free(q[1]); + } + free(rseq); + kseq_destroy(kst); err_gzclose(fpt); + kseq_destroy(ksq); err_gzclose(fpq); + return 0; +} +#endif diff --git a/src/bwa/ksw.h b/src/bwa/ksw.h new file mode 100644 index 000000000..5d45a6744 --- /dev/null +++ b/src/bwa/ksw.h @@ -0,0 +1,114 @@ +#ifndef __AC_KSW_H +#define __AC_KSW_H + +#include + +#define KSW_XBYTE 0x10000 +#define KSW_XSTOP 0x20000 +#define KSW_XSUBO 0x40000 +#define KSW_XSTART 0x80000 + +struct _kswq_t; +typedef struct _kswq_t kswq_t; + +typedef struct { + int score; // best score + int te, qe; // target end and query end + int score2, te2; // second best score and ending position on the target + int tb, qb; // target start and query start +} kswr_t; + +#ifdef __cplusplus +extern "C" { +#endif + + /** + * Aligning two sequences + * + * @param qlen length of the query sequence (typically =0, *gscore keeps the best score such that + * the entire query sequence is aligned; *gtle keeps the position on the + * target where *gscore is achieved. Returning *gscore and *gtle helps the + * caller to decide whether an end-to-end hit or a partial hit is preferred. 
+ * + * The first 9 parameters are identical to those in ksw_global() + * + * @param h0 alignment score of upstream sequences + * @param _qle (out) length of the query in the alignment + * @param _tle (out) length of the target in the alignment + * @param _gtle (out) length of the target if query is fully aligned + * @param _gscore (out) score of the best end-to-end alignment; negative if not found + * + * @return best semi-local alignment score + */ + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); + int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/bwa/kthread.c b/src/bwa/kthread.c new file mode 100644 index 000000000..062bba3f8 --- /dev/null +++ b/src/bwa/kthread.c @@ -0,0 +1,148 @@ +#include +#include +#include +#include + +/************ + * kt_for() * + ************/ + +struct kt_for_t; + +typedef struct { + struct kt_for_t *t; + long i; +} ktf_worker_t; + +typedef struct kt_for_t { + int n_threads; + long n; + ktf_worker_t *w; + void (*func)(void*,long,int); + void *data; +} kt_for_t; + +static inline long steal_work(kt_for_t *t) +{ + int i, min_i = -1; + long k, min = LONG_MAX; + for (i = 0; i < t->n_threads; ++i) + if (min > t->w[i].i) min = t->w[i].i, min_i = i; + k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads); + return k >= t->n? 
-1 : k; +} + +static void *ktf_worker(void *data) +{ + ktf_worker_t *w = (ktf_worker_t*)data; + long i; + for (;;) { + i = __sync_fetch_and_add(&w->i, w->t->n_threads); + if (i >= w->t->n) break; + w->t->func(w->t->data, i, w - w->t->w); + } + while ((i = steal_work(w->t)) >= 0) + w->t->func(w->t->data, i, w - w->t->w); + pthread_exit(0); +} + +void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n) +{ + int i; + kt_for_t t; + pthread_t *tid; + t.func = func, t.data = data, t.n_threads = n_threads, t.n = n; + t.w = (ktf_worker_t*)alloca(n_threads * sizeof(ktf_worker_t)); + tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t)); + for (i = 0; i < n_threads; ++i) + t.w[i].t = &t, t.w[i].i = i; + for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]); + for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); +} + +/***************** + * kt_pipeline() * + *****************/ + +struct ktp_t; + +typedef struct { + struct ktp_t *pl; + int step, running; + void *data; +} ktp_worker_t; + +typedef struct ktp_t { + void *shared; + void *(*func)(void*, int, void*, FILE*); //JEREMIAH + FILE *ostream; //JEREMIAH + int n_workers, n_steps; + ktp_worker_t *workers; + pthread_mutex_t mutex; + pthread_cond_t cv; +} ktp_t; + +static void *ktp_worker(void *data) +{ + ktp_worker_t *w = (ktp_worker_t*)data; + ktp_t *p = w->pl; + while (w->step < p->n_steps) { + // test whether we can kick off the job with this worker + pthread_mutex_lock(&p->mutex); + for (;;) { + int i; + // test whether another worker is doing the same step + for (i = 0; i < p->n_workers; ++i) { + if (w == &p->workers[i]) continue; // ignore itself + if (p->workers[i].running && p->workers[i].step == w->step) + break; + } + if (i == p->n_workers) break; // no other workers doing w->step; then this worker will + pthread_cond_wait(&p->cv, &p->mutex); + } + w->running = 1; + pthread_mutex_unlock(&p->mutex); + + // working on w->step + //JEREMIAH + w->data = 
p->func(p->shared, w->step, w->step? w->data : 0, p->ostream); // for the first step, input is NULL + + // update step and let other workers know + pthread_mutex_lock(&p->mutex); + w->step = w->step == p->n_steps - 1 || w->data? (w->step + 1) % p->n_steps : p->n_steps; + w->running = 0; + pthread_cond_broadcast(&p->cv); + pthread_mutex_unlock(&p->mutex); + } + pthread_exit(0); +} + +//JEREMIAH added FILE* +void kt_pipeline(int n_threads, void *(*func)(void*, int, void*, FILE*), void *shared_data, int n_steps, FILE *ostream) +{ + ktp_t aux; + pthread_t *tid; + int i; + fprintf(stdout, "KTPIPELINE\n"); + if (n_threads < 1) n_threads = 1; + aux.n_workers = n_threads; + aux.n_steps = n_steps; + aux.func = func; + aux.ostream = ostream; //JEREMIAH + aux.shared = shared_data; + pthread_mutex_init(&aux.mutex, 0); + pthread_cond_init(&aux.cv, 0); + + aux.workers = alloca(n_threads * sizeof(ktp_worker_t)); + for (i = 0; i < n_threads; ++i) { + ktp_worker_t *w = &aux.workers[i]; + w->step = w->running = 0; w->pl = &aux; w->data = 0; + } + + tid = alloca(n_threads * sizeof(pthread_t)); + for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktp_worker, &aux.workers[i]); + for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); + + pthread_mutex_destroy(&aux.mutex); + pthread_cond_destroy(&aux.cv); +} diff --git a/src/bwa/kvec.h b/src/bwa/kvec.h new file mode 100644 index 000000000..83ad483a6 --- /dev/null +++ b/src/bwa/kvec.h @@ -0,0 +1,94 @@ +/* The MIT License + + Copyright (c) 2008, by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright 
notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + An example: + +#include "kvec.h" +int main() { + kvec_t(int) array; + kv_init(array); + kv_push(int, array, 10); // append + kv_a(int, array, 20) = 5; // dynamic + kv_A(array, 20) = 4; // static + kv_destroy(array); + return 0; +} +*/ + +/* + 2008-09-22 (0.1.0): + + * The initial version. + +*/ + +#ifndef AC_KVEC_H +#define AC_KVEC_H + +#include + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) + +#define kvec_t(type) struct { size_t n, m; type *a; } +#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) +#define kv_destroy(v) free((v).a) +#define kv_A(v, i) ((v).a[(i)]) +#define kv_pop(v) ((v).a[--(v).n]) +#define kv_size(v) ((v).n) +#define kv_max(v) ((v).m) + +#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m)) + +#define kv_copy(type, v1, v0) do { \ + if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \ + (v1).n = (v0).n; \ + memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ + } while (0) \ + +#define kv_push(type, v, x) do { \ + if ((v).n == (v).m) { \ + (v).m = (v).m? (v).m<<1 : 2; \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ + } \ + (v).a[(v).n++] = (x); \ + } while (0) + +#define kv_pushp(type, v) ((((v).n == (v).m)? \ + ((v).m = ((v).m? 
(v).m<<1 : 2), \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ + : 0), &(v).a[(v).n++]) + +#define kv_a(type, v, i) (((v).m <= (size_t)(i)? \ + ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ + : (v).n <= (size_t)(i)? (v).n = (i) + 1 \ + : 0), (v).a[(i)]) + +#endif diff --git a/src/bwa/main.c b/src/bwa/main.c new file mode 100644 index 000000000..9f776d42d --- /dev/null +++ b/src/bwa/main.c @@ -0,0 +1,102 @@ +#include +#include +#include "kstring.h" +#include "utils.h" + +#ifndef PACKAGE_VERSION +#define PACKAGE_VERSION "0.7.12-r1039" +#endif + +int bwa_fa2pac(int argc, char *argv[]); +int bwa_pac2bwt(int argc, char *argv[]); +int bwa_bwtupdate(int argc, char *argv[]); +int bwa_bwt2sa(int argc, char *argv[]); +int bwa_index(int argc, char *argv[]); +int bwt_bwtgen_main(int argc, char *argv[]); + +int bwa_aln(int argc, char *argv[]); +int bwa_sai2sam_se(int argc, char *argv[]); +int bwa_sai2sam_pe(int argc, char *argv[]); + +int bwa_bwtsw2(int argc, char *argv[]); + +int main_fastmap(int argc, char *argv[]); +int main_mem(int argc, char *argv[]); +int main_shm(int argc, char *argv[]); + +int main_pemerge(int argc, char *argv[]); + +static int usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Program: bwa (alignment via Burrows-Wheeler transformation)\n"); + fprintf(stderr, "Version: %s\n", PACKAGE_VERSION); + fprintf(stderr, "Contact: Heng Li \n\n"); + fprintf(stderr, "Usage: bwa [options]\n\n"); + fprintf(stderr, "Command: index index sequences in the FASTA format\n"); + fprintf(stderr, " mem BWA-MEM algorithm\n"); + fprintf(stderr, " fastmap identify super-maximal exact matches\n"); + fprintf(stderr, " pemerge merge overlapping paired ends (EXPERIMENTAL)\n"); + fprintf(stderr, " aln gapped/ungapped alignment\n"); + fprintf(stderr, " samse generate alignment (single ended)\n"); + fprintf(stderr, " sampe generate alignment (paired ended)\n"); + fprintf(stderr, " bwasw BWA-SW for long 
queries\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " shm manage indices in shared memory\n"); + fprintf(stderr, " fa2pac convert FASTA to PAC format\n"); + fprintf(stderr, " pac2bwt generate BWT from PAC\n"); + fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n"); + fprintf(stderr, " bwtupdate update .bwt to the new format\n"); + fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n"); + fprintf(stderr, "\n"); + fprintf(stderr, +"Note: To use BWA, you need to first index the genome with `bwa index'.\n" +" There are three alignment algorithms in BWA: `mem', `bwasw', and\n" +" `aln/samse/sampe'. If you are not sure which to use, try `bwa mem'\n" +" first. Please `man ./bwa.1' for the manual.\n\n"); + return 1; +} + +int main(int argc, char *argv[]) +{ + extern char *bwa_pg; + int i, ret; + double t_real; + kstring_t pg = {0,0,0}; + t_real = realtime(); + ksprintf(&pg, "@PG\tID:bwa\tPN:bwa\tVN:%s\tCL:%s", PACKAGE_VERSION, argv[0]); + for (i = 1; i < argc; ++i) ksprintf(&pg, " %s", argv[i]); + bwa_pg = pg.s; + if (argc < 2) return usage(); + if (strcmp(argv[1], "fa2pac") == 0) ret = bwa_fa2pac(argc-1, argv+1); + else if (strcmp(argv[1], "pac2bwt") == 0) ret = bwa_pac2bwt(argc-1, argv+1); + else if (strcmp(argv[1], "pac2bwtgen") == 0) ret = bwt_bwtgen_main(argc-1, argv+1); + else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1); + else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1); + else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1); + else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1); + else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1); + else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1); + else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1); + else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); + else if (strcmp(argv[1], "bwasw") == 0) ret = 
bwa_bwtsw2(argc-1, argv+1); + else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1); + else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1); + else if (strcmp(argv[1], "shm") == 0) ret = main_shm(argc-1, argv+1); + else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1); + else { + fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); + return 1; + } + err_fflush(stdout); + err_fclose(stdout); + if (ret == 0) { + fprintf(stderr, "[%s] Version: %s\n", __func__, PACKAGE_VERSION); + fprintf(stderr, "[%s] CMD:", __func__); + for (i = 0; i < argc; ++i) + fprintf(stderr, " %s", argv[i]); + fprintf(stderr, "\n[%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - t_real, cputime()); + } + free(bwa_pg); + return ret; +} diff --git a/src/bwa/malloc_wrap.c b/src/bwa/malloc_wrap.c new file mode 100644 index 000000000..100b8cb6e --- /dev/null +++ b/src/bwa/malloc_wrap.c @@ -0,0 +1,57 @@ +#include +#include +#include +#include +#ifdef USE_MALLOC_WRAPPERS +/* Don't wrap ourselves */ +# undef USE_MALLOC_WRAPPERS +#endif +#include "malloc_wrap.h" + +void *wrap_calloc(size_t nmemb, size_t size, + const char *file, unsigned int line, const char *func) { + void *p = calloc(nmemb, size); + if (NULL == p) { + fprintf(stderr, + "[%s] Failed to allocate %zd bytes at %s line %u: %s\n", + func, nmemb * size, file, line, strerror(errno)); + exit(EXIT_FAILURE); + } + return p; +} + +void *wrap_malloc(size_t size, + const char *file, unsigned int line, const char *func) { + void *p = malloc(size); + if (NULL == p) { + fprintf(stderr, + "[%s] Failed to allocate %zd bytes at %s line %u: %s\n", + func, size, file, line, strerror(errno)); + exit(EXIT_FAILURE); + } + return p; +} + +void *wrap_realloc(void *ptr, size_t size, + const char *file, unsigned int line, const char *func) { + void *p = realloc(ptr, size); + if (NULL == p) { + fprintf(stderr, + "[%s] Failed to allocate %zd bytes at %s line %u: %s\n", + func, 
size, file, line, strerror(errno)); + exit(EXIT_FAILURE); + } + return p; +} + +char *wrap_strdup(const char *s, + const char *file, unsigned int line, const char *func) { + char *p = strdup(s); + if (NULL == p) { + fprintf(stderr, + "[%s] Failed to allocate %zd bytes at %s line %u: %s\n", + func, strlen(s), file, line, strerror(errno)); + exit(EXIT_FAILURE); + } + return p; +} diff --git a/src/bwa/malloc_wrap.h b/src/bwa/malloc_wrap.h new file mode 100644 index 000000000..a55876a13 --- /dev/null +++ b/src/bwa/malloc_wrap.h @@ -0,0 +1,47 @@ +#ifndef MALLOC_WRAP_H +#define MALLOC_WRAP_H + +#include /* Avoid breaking the usual definitions */ +#include + +#ifdef __cplusplus +extern "C" { +#endif + + void *wrap_calloc(size_t nmemb, size_t size, + const char *file, unsigned int line, const char *func); + void *wrap_malloc(size_t size, + const char *file, unsigned int line, const char *func); + void *wrap_realloc(void *ptr, size_t size, + const char *file, unsigned int line, const char *func); + char *wrap_strdup(const char *s, + const char *file, unsigned int line, const char *func); + +#ifdef __cplusplus +} +#endif + +#ifdef USE_MALLOC_WRAPPERS +# ifdef calloc +# undef calloc +# endif +# define calloc(n, s) wrap_calloc( (n), (s), __FILE__, __LINE__, __func__) + +# ifdef malloc +# undef malloc +# endif +# define malloc(s) wrap_malloc( (s), __FILE__, __LINE__, __func__) + +# ifdef realloc +# undef realloc +# endif +# define realloc(p, s) wrap_realloc((p), (s), __FILE__, __LINE__, __func__) + +# ifdef strdup +# undef strdup +# endif +# define strdup(s) wrap_strdup( (s), __FILE__, __LINE__, __func__) + +#endif /* USE_MALLOC_WRAPPERS */ + +#endif /* MALLOC_WRAP_H */ diff --git a/src/bwa/pemerge.c b/src/bwa/pemerge.c new file mode 100644 index 000000000..725885f1c --- /dev/null +++ b/src/bwa/pemerge.c @@ -0,0 +1,291 @@ +#include +#include +#include +#include +#include +#include +#include +#include "ksw.h" +#include "kseq.h" +#include "kstring.h" +#include "bwa.h" +#include 
"utils.h" +KSEQ_DECLARE(gzFile) + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +#define MAX_SCORE_RATIO 0.9f +#define MAX_ERR 8 + +static const char *err_msg[MAX_ERR+1] = { + "successful merges", + "low-scoring pairs", + "pairs where the best SW alignment is not an overlap (long left end)", + "pairs where the best SW alignment is not an overlap (long right end)", + "pairs with large 2nd best SW score", + "pairs with gapped overlap", + "pairs where the end-to-end alignment is inconsistent with SW", + "pairs potentially with tandem overlaps", + "pairs with high sum of errors" +}; + +typedef struct { + int a, b, q, r, w; + int q_def, q_thres; + int T; + int chunk_size; + int n_threads; + int flag; // bit 1: print merged; 2: print unmerged + int8_t mat[25]; +} pem_opt_t; + +pem_opt_t *pem_opt_init() +{ + pem_opt_t *opt; + opt = calloc(1, sizeof(pem_opt_t)); + opt->a = 5; opt->b = 4; opt->q = 2, opt->r = 17; opt->w = 20; + opt->T = opt->a * 10; + opt->q_def = 20; + opt->q_thres = 70; + opt->chunk_size = 10000000; + opt->n_threads = 1; + opt->flag = 3; + bwa_fill_scmat(opt->a, opt->b, opt->mat); + return opt; +} + +int bwa_pemerge(const pem_opt_t *opt, bseq1_t x[2]) +{ + uint8_t *s[2], *q[2], *seq, *qual; + int i, xtra, l, l_seq, sum_q, ret = 0; + kswr_t r; + + s[0] = malloc(x[0].l_seq); q[0] = malloc(x[0].l_seq); + s[1] = malloc(x[1].l_seq); q[1] = malloc(x[1].l_seq); + for (i = 0; i < x[0].l_seq; ++i) { + int c = x[0].seq[i]; + s[0][i] = c < 0 || c > 127? 4 : c <= 4? c : nst_nt4_table[c]; + q[0][i] = x[0].qual? x[0].qual[i] - 33 : opt->q_def; + } + for (i = 0; i < x[1].l_seq; ++i) { + int c = x[1].seq[x[1].l_seq - 1 - i]; + c = c < 0 || c > 127? 4 : c < 4? c : nst_nt4_table[c]; + s[1][i] = c < 4? 3 - c : 4; + q[1][i] = x[1].qual? 
x[1].qual[x[1].l_seq - 1 - i] - 33 : opt->q_def; + } + + xtra = KSW_XSTART | KSW_XSUBO; + r = ksw_align(x[1].l_seq, s[1], x[0].l_seq, s[0], 5, opt->mat, opt->q, opt->r, xtra, 0); + ++r.qe; ++r.te; // change to the half-close-half-open coordinates + + if (r.score < opt->T) { ret = -1; goto pem_ret; } // poor alignment + if (r.tb < r.qb) { ret = -2; goto pem_ret; } // no enough space for the left end + if (x[0].l_seq - r.te > x[1].l_seq - r.qe) { ret = -3; goto pem_ret; } // no enough space for the right end + if ((double)r.score2 / r.score >= MAX_SCORE_RATIO) { ret = -4; goto pem_ret; } // the second best score is too large + if (r.qe - r.qb != r.te - r.tb) { ret = -5; goto pem_ret; } // we do not allow gaps + + { // test tandem match; O(n^2) + int max_m, max_m2, min_l, max_l, max_l2; + max_m = max_m2 = 0; max_l = max_l2 = 0; + min_l = x[0].l_seq < x[1].l_seq? x[0].l_seq : x[1].l_seq; + for (l = 1; l < min_l; ++l) { + int m = 0, o = x[0].l_seq - l; + uint8_t *s0o = &s[0][o], *s1 = s[1]; + for (i = 0; i < l; ++i) // TODO: in principle, this can be done with SSE2. It is the bottleneck! 
+ m += opt->mat[(s1[i]<<2) + s1[i] + s0o[i]]; // equivalent to s[1][i]*5 + s[0][o+i] + if (m > max_m) max_m2 = max_m, max_m = m, max_l2 = max_l, max_l = l; + else if (m > max_m2) max_m2 = m, max_l2 = l; + } + if (max_m < opt->T || max_l != x[0].l_seq - (r.tb - r.qb)) { ret = -6; goto pem_ret; } + if (max_l2 < max_l && max_m2 >= opt->T && (double)(max_m2 + (max_l - max_l2) * opt->a) / max_m >= MAX_SCORE_RATIO) { + ret = -7; goto pem_ret; + } + if (max_l2 > max_l && (double)max_m2 / max_m >= MAX_SCORE_RATIO) { ret = -7; goto pem_ret; } + } + + l = x[0].l_seq - (r.tb - r.qb); // length to merge + l_seq = x[0].l_seq + x[1].l_seq - l; + seq = malloc(l_seq + 1); + qual = malloc(l_seq + 1); + memcpy(seq, s[0], x[0].l_seq); memcpy(seq + x[0].l_seq, &s[1][l], x[1].l_seq - l); + memcpy(qual, q[0], x[0].l_seq); memcpy(qual + x[0].l_seq, &q[1][l], x[1].l_seq - l); + for (i = 0, sum_q = 0; i < l; ++i) { + int k = x[0].l_seq - l + i; + if (s[0][k] == 4) { // ambiguous + seq[k] = s[1][i]; + qual[k] = q[1][i]; + } else if (s[1][i] == 4) { // do nothing + } else if (s[0][k] == s[1][i]) { + qual[k] = qual[k] > q[1][i]? qual[k] : q[1][i]; + } else { // s[0][k] != s[1][i] and neither is N + int qq = q[0][k] < q[1][i]? q[0][k] : q[1][i]; + sum_q += qq >= 3? qq<<1 : 1; + seq[k] = q[0][k] > q[1][i]? s[0][k] : s[1][i]; + qual[k] = abs((int)q[0][k] - (int)q[1][i]); + } + } + if (sum_q>>1 > opt->q_thres) { // too many mismatches + free(seq); free(qual); + ret = -8; goto pem_ret; + } + + for (i = 0; i < l_seq; ++i) seq[i] = "ACGTN"[(int)seq[i]], qual[i] += 33; + seq[l_seq] = qual[l_seq] = 0; + + free(x[1].name); free(x[1].seq); free(x[1].qual); free(x[1].comment); + memset(&x[1], 0, sizeof(bseq1_t)); + free(x[0].seq); free(x[0].qual); + x[0].l_seq = l_seq; x[0].seq = (char*)seq; x[0].qual = (char*)qual; + +pem_ret: + free(s[0]); free(s[1]); free(q[0]); free(q[1]); + return ret; +} + +static inline void print_bseq(const bseq1_t *s, int rn) +{ + err_putchar(s->qual? 
'@' : '>'); + err_fputs(s->name, stdout); + if (rn == 1 || rn == 2) { + err_putchar('/'); err_putchar('0' + rn); err_putchar('\n'); + } else err_puts(" merged"); + err_puts(s->seq); + if (s->qual) { + err_puts("+"); err_puts(s->qual); + } +} + +typedef struct { + int n, start; + bseq1_t *seqs; + int64_t cnt[MAX_ERR+1]; + const pem_opt_t *opt; +} worker_t; + +void *worker(void *data) +{ + worker_t *w = (worker_t*)data; + int i; + for (i = w->start; i < w->n>>1; i += w->opt->n_threads) + ++w->cnt[-bwa_pemerge(w->opt, &w->seqs[i<<1])]; + return 0; +} + +static void process_seqs(const pem_opt_t *opt, int n_, bseq1_t *seqs, int64_t cnt[MAX_ERR+1]) +{ + int i, j, n = n_>>1<<1; + worker_t *w; + + w = calloc(opt->n_threads, sizeof(worker_t)); + for (i = 0; i < opt->n_threads; ++i) { + worker_t *p = &w[i]; + p->start = i; p->n = n; + p->opt = opt; + p->seqs = seqs; + } + if (opt->n_threads == 1) { + worker(w); + } else { + pthread_t *tid; + tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); + for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker, &w[i]); + for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); + free(tid); + } + for (i = 0; i < opt->n_threads; ++i) { + worker_t *p = &w[i]; + for (j = 0; j <= MAX_ERR; ++j) cnt[j] += p->cnt[j]; + } + free(w); + for (i = 0; i < n>>1; ++i) { + if (seqs[i<<1|1].l_seq != 0) { + if (opt->flag&2) { + print_bseq(&seqs[i<<1|0], 1); + print_bseq(&seqs[i<<1|1], 2); + } + } else if (opt->flag&1) + print_bseq(&seqs[i<<1|0], 0); + } + for (i = 0; i < n; ++i) { + bseq1_t *s = &seqs[i]; + free(s->name); free(s->seq); free(s->qual); free(s->comment); + } +} + +int main_pemerge(int argc, char *argv[]) +{ + int c, flag = 0, i, n, min_ovlp = 10; + int64_t cnt[MAX_ERR+1]; + bseq1_t *bseq; + gzFile fp, fp2 = 0; + kseq_t *ks, *ks2 = 0; + pem_opt_t *opt; + + opt = pem_opt_init(); + while ((c = getopt(argc, argv, "muQ:t:T:")) >= 0) { + if (c == 'm') flag |= 1; + else if (c == 'u') flag |= 2; + else if (c == 'Q') 
opt->q_thres = atoi(optarg); + else if (c == 't') opt->n_threads = atoi(optarg); + else if (c == 'T') min_ovlp = atoi(optarg); + else return 1; + } + if (flag == 0) flag = 3; + opt->flag = flag; + opt->T = opt->a * min_ovlp; + + if (optind == argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa pemerge [-mu] [read2.fq]\n\n"); + fprintf(stderr, "Options: -m output merged reads only\n"); + fprintf(stderr, " -u output unmerged reads only\n"); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -T INT minimum end overlap [%d]\n", min_ovlp); + fprintf(stderr, " -Q INT max sum of errors [%d]\n", opt->q_thres); + fprintf(stderr, "\n"); + free(opt); + return 1; + } + + fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + if (NULL == fp) { + fprintf(stderr, "Couldn't open %s : %s\n", + strcmp(argv[optind], "-") ? argv[optind] : "stdin", + errno ? strerror(errno) : "Out of memory"); + exit(EXIT_FAILURE); + } + ks = kseq_init(fp); + if (optind + 1 < argc) { + fp2 = strcmp(argv[optind+1], "-")? gzopen(argv[optind+1], "r") : gzdopen(fileno(stdin), "r"); + if (NULL == fp) { + fprintf(stderr, "Couldn't open %s : %s\n", + strcmp(argv[optind+1], "-") ? argv[optind+1] : "stdin", + errno ? 
strerror(errno) : "Out of memory"); + exit(EXIT_FAILURE); + } + ks2 = kseq_init(fp2); + } + + memset(cnt, 0, 8 * (MAX_ERR+1)); + while ((bseq = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) { + process_seqs(opt, n, bseq, cnt); + free(bseq); + } + + fprintf(stderr, "%12ld %s\n", (long)cnt[0], err_msg[0]); + for (i = 1; i <= MAX_ERR; ++i) + fprintf(stderr, "%12ld %s\n", (long)cnt[i], err_msg[i]); + kseq_destroy(ks); + err_gzclose(fp); + if (ks2) { + kseq_destroy(ks2); + err_gzclose(fp2); + } + free(opt); + + err_fflush(stdout); + + return 0; +} diff --git a/src/bwa/utils.c b/src/bwa/utils.c new file mode 100644 index 000000000..298326174 --- /dev/null +++ b/src/bwa/utils.c @@ -0,0 +1,295 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +/* Contact: Heng Li */ +#define FSYNC_ON_FLUSH + +#include +#include +#include +#include +#include +#include +#ifdef FSYNC_ON_FLUSH +#include +#include +#include +#endif +#include +#include +#include "utils.h" + +#include "ksort.h" +#define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y)) +KSORT_INIT(128, pair64_t, pair64_lt) +KSORT_INIT(64, uint64_t, ks_lt_generic) + +#include "kseq.h" +KSEQ_INIT2(, gzFile, err_gzread) + +/******************** + * System utilities * + ********************/ + +FILE *err_xopen_core(const char *func, const char *fn, const char *mode) +{ + FILE *fp = 0; + if (strcmp(fn, "-") == 0) + return (strstr(mode, "r"))? stdin : stdout; + if ((fp = fopen(fn, mode)) == 0) { + err_fatal(func, "fail to open file '%s' : %s", fn, strerror(errno)); + } + return fp; +} + +FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp) +{ + if (freopen(fn, mode, fp) == 0) { + err_fatal(func, "fail to open file '%s' : %s", fn, strerror(errno)); + } + return fp; +} + +gzFile err_xzopen_core(const char *func, const char *fn, const char *mode) +{ + gzFile fp; + if (strcmp(fn, "-") == 0) { + fp = gzdopen(fileno((strstr(mode, "r"))? stdin : stdout), mode); + /* According to zlib.h, this is the only reason gzdopen can fail */ + if (!fp) err_fatal(func, "Out of memory"); + return fp; + } + if ((fp = gzopen(fn, mode)) == 0) { + err_fatal(func, "fail to open file '%s' : %s", fn, errno ? strerror(errno) : "Out of memory"); + } + return fp; +} + +void err_fatal(const char *header, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + fprintf(stderr, "[%s] ", header); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n"); + va_end(args); + exit(EXIT_FAILURE); +} + +void err_fatal_core(const char *header, const char *fmt, ...) 
+{ + va_list args; + va_start(args, fmt); + fprintf(stderr, "[%s] ", header); + vfprintf(stderr, fmt, args); + fprintf(stderr, " Abort!\n"); + va_end(args); + abort(); +} + +void _err_fatal_simple(const char *func, const char *msg) +{ + fprintf(stderr, "[%s] %s\n", func, msg); + exit(EXIT_FAILURE); +} + +void _err_fatal_simple_core(const char *func, const char *msg) +{ + fprintf(stderr, "[%s] %s Abort!\n", func, msg); + abort(); +} + +size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) +{ + size_t ret = fwrite(ptr, size, nmemb, stream); + if (ret != nmemb) + _err_fatal_simple("fwrite", strerror(errno)); + return ret; +} + +size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream) +{ + size_t ret = fread(ptr, size, nmemb, stream); + if (ret != nmemb) + { + _err_fatal_simple("fread", ferror(stream) ? strerror(errno) : "Unexpected end of file"); + } + return ret; +} + +int err_gzread(gzFile file, void *ptr, unsigned int len) +{ + int ret = gzread(file, ptr, len); + + if (ret < 0) + { + int errnum = 0; + const char *msg = gzerror(file, &errnum); + _err_fatal_simple("gzread", Z_ERRNO == errnum ? strerror(errno) : msg); + } + + return ret; +} + +int err_fseek(FILE *stream, long offset, int whence) +{ + int ret = fseek(stream, offset, whence); + if (0 != ret) + { + _err_fatal_simple("fseek", strerror(errno)); + } + return ret; +} + +long err_ftell(FILE *stream) +{ + long ret = ftell(stream); + if (-1 == ret) + { + _err_fatal_simple("ftell", strerror(errno)); + } + return ret; +} + +int err_printf(const char *format, ...) +{ + va_list arg; + int done; + va_start(arg, format); + done = vfprintf(stdout, format, arg); + int saveErrno = errno; + va_end(arg); + if (done < 0) _err_fatal_simple("vfprintf(stdout)", strerror(saveErrno)); + return done; +} + +int err_fprintf(FILE *stream, const char *format, ...) 
+{ + va_list arg; + int done; + va_start(arg, format); + done = vfprintf(stream, format, arg); + int saveErrno = errno; + va_end(arg); + if (done < 0) _err_fatal_simple("vfprintf", strerror(saveErrno)); + return done; +} + +int err_fputc(int c, FILE *stream) +{ + int ret = putc(c, stream); + if (EOF == ret) + { + _err_fatal_simple("fputc", strerror(errno)); + } + + return ret; +} + +int err_fputs(const char *s, FILE *stream) +{ + int ret = fputs(s, stream); + if (EOF == ret) + { + _err_fatal_simple("fputs", strerror(errno)); + } + + return ret; +} + +int err_puts(const char *s) +{ + int ret = puts(s); + if (EOF == ret) + { + _err_fatal_simple("puts", strerror(errno)); + } + + return ret; +} + +int err_fflush(FILE *stream) +{ + int ret = fflush(stream); + if (ret != 0) _err_fatal_simple("fflush", strerror(errno)); + +#ifdef FSYNC_ON_FLUSH + /* Calling fflush() ensures that all the data has made it to the + kernel buffers, but this may not be sufficient for remote filesystems + (e.g. NFS, lustre) as an error may still occur while the kernel + is copying the buffered data to the file server. To be sure of + catching these errors, we need to call fsync() on the file + descriptor, but only if it is a regular file. */ + { + struct stat sbuf; + if (0 != fstat(fileno(stream), &sbuf)) + _err_fatal_simple("fstat", strerror(errno)); + + if (S_ISREG(sbuf.st_mode)) + { + if (0 != fsync(fileno(stream))) + _err_fatal_simple("fsync", strerror(errno)); + } + } +#endif + return ret; +} + +int err_fclose(FILE *stream) +{ + int ret = fclose(stream); + if (ret != 0) _err_fatal_simple("fclose", strerror(errno)); + return ret; +} + +int err_gzclose(gzFile file) +{ + int ret = gzclose(file); + if (Z_OK != ret) + { + _err_fatal_simple("gzclose", Z_ERRNO == ret ? 
strerror(errno) : zError(ret)); + } + + return ret; +} + +/********* + * Timer * + *********/ + +double cputime() +{ + struct rusage r; + getrusage(RUSAGE_SELF, &r); + return r.ru_utime.tv_sec + r.ru_stime.tv_sec + 1e-6 * (r.ru_utime.tv_usec + r.ru_stime.tv_usec); +} + +double realtime() +{ + struct timeval tp; + struct timezone tzp; + gettimeofday(&tp, &tzp); + return tp.tv_sec + tp.tv_usec * 1e-6; +} diff --git a/src/bwa/utils.h b/src/bwa/utils.h new file mode 100644 index 000000000..11966b8db --- /dev/null +++ b/src/bwa/utils.h @@ -0,0 +1,111 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +/* Contact: Heng Li */ + +#ifndef LH3_UTILS_H +#define LH3_UTILS_H + +#include +#include +#include + +#ifdef __GNUC__ +// Tell GCC to validate printf format string and args +#define ATTRIBUTE(list) __attribute__ (list) +#else +#define ATTRIBUTE(list) +#endif + +#define err_fatal_simple(msg) _err_fatal_simple(__func__, msg) +#define err_fatal_simple_core(msg) _err_fatal_simple_core(__func__, msg) + +#define xopen(fn, mode) err_xopen_core(__func__, fn, mode) +#define xreopen(fn, mode, fp) err_xreopen_core(__func__, fn, mode, fp) +#define xzopen(fn, mode) err_xzopen_core(__func__, fn, mode) + +#define xassert(cond, msg) if ((cond) == 0) _err_fatal_simple_core(__func__, msg) + +typedef struct { + uint64_t x, y; +} pair64_t; + +typedef struct { size_t n, m; uint64_t *a; } uint64_v; +typedef struct { size_t n, m; pair64_t *a; } pair64_v; + +#ifdef __cplusplus +extern "C" { +#endif + + void err_fatal(const char *header, const char *fmt, ...) ATTRIBUTE((noreturn)); + void err_fatal_core(const char *header, const char *fmt, ...) ATTRIBUTE((noreturn)); + void _err_fatal_simple(const char *func, const char *msg) ATTRIBUTE((noreturn)); + void _err_fatal_simple_core(const char *func, const char *msg) ATTRIBUTE((noreturn)); + FILE *err_xopen_core(const char *func, const char *fn, const char *mode); + FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp); + gzFile err_xzopen_core(const char *func, const char *fn, const char *mode); + size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream); + size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream); + + int err_gzread(gzFile file, void *ptr, unsigned int len); + int err_fseek(FILE *stream, long offset, int whence); +#define err_rewind(FP) err_fseek((FP), 0, SEEK_SET) + long err_ftell(FILE *stream); + int err_fprintf(FILE *stream, const char *format, ...) + ATTRIBUTE((format(printf, 2, 3))); + int err_printf(const char *format, ...) 
+ ATTRIBUTE((format(printf, 1, 2))); + int err_fputc(int c, FILE *stream); +#define err_putchar(C) err_fputc((C), stdout) + int err_fputs(const char *s, FILE *stream); + int err_puts(const char *s); + int err_fflush(FILE *stream); + int err_fclose(FILE *stream); + int err_gzclose(gzFile file); + + double cputime(); + double realtime(); + + void ks_introsort_64 (size_t n, uint64_t *a); + void ks_introsort_128(size_t n, pair64_t *a); + +#ifdef __cplusplus +} +#endif + +static inline uint64_t hash_64(uint64_t key) +{ + key += ~(key << 32); + key ^= (key >> 22); + key += ~(key << 13); + key ^= (key >> 8); + key += (key << 3); + key ^= (key >> 15); + key += ~(key << 27); + key ^= (key >> 31); + return key; +} + +#endif