commit dfb12f36e61fa1e7eee95b322935d1ac6c0043e3 Author: SVN-Git Migration Date: Thu Oct 8 09:26:18 2015 -0700 Imported Upstream version 1.1.1 diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..d60c31a --- /dev/null +++ b/COPYING @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. 
The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. 
Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year>  <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details.
+ +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/COPYING.LESSER b/COPYING.LESSER new file mode 100644 index 0000000..602bfc9 --- /dev/null +++ b/COPYING.LESSER @@ -0,0 +1,504 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights.
+ + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". 
The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. 
+ + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. 
A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. 
A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. 
Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. 
If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year>  <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary.
Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James Random Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
+
diff --git a/HACKING b/HACKING
new file mode 100644
index 0000000..461ee79
--- /dev/null
+++ b/HACKING
@@ -0,0 +1,191 @@
+================================
+Some notes on hacking on kitchen
+================================
+
+:Author: Toshio Kuratomi
+:Date: 2 Jan 2012
+:Version: 1.1.x
+
+For kitchen's coding conventions, see the style guide in the documentation.
+
+This file documents meta-information about kitchen such as where to get the
+code and how to make a release.
+
+.. contents::
+
+-----------------------------------------
+Extra software needed for making releases
+-----------------------------------------
+Although kitchen has very few requirements for running, there are a few more
+that are required for making a release:
+
+* python-2.4+ (tested on python-2.7)
+* transifex-client (/usr/bin/tx)
+* gettext (/usr/bin/msgfmt)
+* python-babel (/usr/bin/pybabel)
+* python-sphinx (/usr/bin/sphinx-build)
+* python-nose (/usr/bin/nosetests)
+* python-coverage (/usr/bin/coverage)
+
+--------------
+Get translated
+--------------
+
+We use the translation services at transifex.net to manage po files, coordinate
+people translating strings, and merge new strings to the files.  The following
+instructions briefly tell how to use transifex to update the source languages'
+files and pull new translations for release.  Instructions for actually doing
+translations can be found in the `transifex user's guide`_.
+
+.. _transifex user's guide: http://help.transifex.net/user-guide/translating.html
+
+To generate the POT file (located in the po/ subdirectory), use pybabel to
+extract the messages.  Run the following from the top level directory::
+
+    pybabel extract -o po/kitchen.pot kitchen -kb_ -kbN_
+
+Then commit this pot file and upload to transifex::
+
+    tx push -s
+    bzr commit -m 'Extract new strings from the source files' po/kitchen.pot
+    bzr push
+
+To pull messages from transifex prior to making a release, do::
+
+    tx pull -a
+    bzr commit -m 'Merge new translations from transifex' po/*.po
+
+If you see a status message from transifex like this::
+
+    Pulling new translations for resource kitchen.kitchenpot (source: po/kitchen.pot)
+     -> fr: po/fr.po
+
+it means that transifex has created a brand new po file for you.  You need to
+add the new file to source control and commit it like this::
+
+    bzr add po/fr.po
+    bzr commit -m 'New French translation' po/fr.po
+
+TODO: Add information about announcing string freeze.  Using transifex's add
+release function to coordinate with translators.  Mailing a translators list,
+etc.
+
+--------
+Releases
+--------
+
+.. note:: If a release is not time critical, make an effort to get the
+    software translated first.  See `Get translated`_ for details.
+
+Testing
+=======
+
+Even though python is an interpreted language, there are several ways to test
+that the software is correct.
+
+Test that docs build
+--------------------
+
+Documentation is written in ReStructuredText format and built via the
+:mod:`sphinx` documentation system for python.  There is a variety of
+hand-written and formatted documentation in the :file:`docs` directory.  Those
+documents also pull some documentation out of the docstrings in the code.
+
+Any of those places may have formatting that is not valid in the sphinx
+system.  Building the documentation into html will reveal any spots that need
+to be fixed::
+
+    python setup.py build_sphinx --fresh-env
+
+The command will attempt to turn the documentation into html.  Any errors or
+warnings in the output mean that there's some piece of documentation that
+sphinx doesn't know how to deal with.  That should be fixed before publishing
+the release.
+
+Test that message catalogs compile
+----------------------------------
+
+One of the pieces of creating a new release is downloading new message
+catalogs from transifex.  Once in a great while, a translator will upload a
+translation there that causes problems (for instance, adding or omitting
+format strings from a translated string.)  Luckily the commands to create the
+message catalogs will detect things like this, so just compiling the catalogs
+will determine if any translations need to be adjusted::
+
+    ./releaseutils.py
+
+This will iterate through all the message catalogs that transifex downloaded
+to the :file:`po` directory and compile them into the :file:`locale`
+directory (a sketch of the general shape of this step follows the warning
+below).
+
+.. warning:: If :file:`/usr/bin/msgfmt` is not installed, this command will
+    still compile the message catalogs but it will use babel.  Babel,
+    unfortunately, doesn't check for all the errors in message catalogs that
+    msgfmt does so it may say that the messages are fine when they really
+    aren't.  Make sure you have msgfmt available by installing gettext.
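+
+For reference, here is a minimal sketch of that kind of compile loop.  It is
+illustrative only, not the actual :file:`releaseutils.py`; the function name,
+paths, and the msgfmt invocation are assumptions::
+
+    import glob
+    import os
+    import subprocess
+
+    def compile_catalogs(podir='po', localedir='locale', domain='kitchen'):
+        # msgfmt --check catches errors (such as format string mismatches)
+        # that babel's compiler lets through
+        for po_file in glob.glob(os.path.join(podir, '*.po')):
+            lang = os.path.splitext(os.path.basename(po_file))[0]
+            mo_dir = os.path.join(localedir, lang, 'LC_MESSAGES')
+            if not os.path.isdir(mo_dir):
+                os.makedirs(mo_dir)
+            mo_file = os.path.join(mo_dir, '%s.mo' % domain)
+            if subprocess.call(['msgfmt', '--check', '-o', mo_file, po_file]):
+                raise RuntimeError('msgfmt rejected %s' % po_file)
+
+    if __name__ == '__main__':
+        compile_catalogs()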
+
+Unittest
+--------
+
+Kitchen has a large set of unittests.  All of them should pass before release.
+You can run the unittests with the following command::
+
+    nosetests --with-coverage --cover-package kitchen
+
+This will run all the unittests under the tests directory and also generate
+some statistics about which lines of code were not accessed when kitchen ran.
+
+.. warning:: Although 100% test coverage is a worthy goal, it doesn't mean
+    that the code is bug free.  This is especially true of code, like
+    kitchen's, that deals with encoding issues.  The same piece of code in
+    kitchen will do different things depending on whether unicode or byte str
+    (and the characters that are in the byte str) is passed as a parameter and
+    what encoding is specified in certain environment variables.  You can take
+    a look at :file:`test_i18n.py` and :file:`test_converters.py` to see tests
+    that attempt to cover enough input values to detect problems; a small
+    sketch of the idea closes this section.
+
+Since kitchen is currently supported on python-2.3.1+, it is desirable to test
+kitchen on every python version from python-2.3 through python-2.7.  We
+currently have access to a buildbot that has access to python-2.4, python-2.6,
+and python-2.7.  You can view it at http://ci.csh.rit.edu:8080/view/Kitchen/ .
+The buildbot checks the devel repository hourly and, if new checkins have
+occurred, attempts to rebuild.  If you need access to invoke builds on the
+buildbot more regularly than that, contact Toshio to get access.
+
+We were unable to get python-2.3 working in the buildbot so I manually run the
+unittests on a CentOS-4 virtual machine (with python-2.3).  I currently don't
+test on python-2.5 but I'd be happy to take bug reports or get a new committer
+who is interested in that platform.
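+
+As a small illustration of that kind of input coverage, here is a sketch in
+the style of those test files.  It is not copied from them; the function
+under test is real but the test class and values are arbitrary::
+
+    # -*- coding: utf-8 -*-
+    import unittest
+
+    from kitchen.text.converters import to_unicode
+
+    class EncodingCoverageSketch(unittest.TestCase):
+        u_entree = u'entr\xe9e'                    # unicode string
+        utf8_entree = u_entree.encode('utf-8')     # same text as utf-8 bytes
+        latin1_entree = u_entree.encode('latin-1') # same text as latin-1 bytes
+
+        def test_to_unicode(self):
+            # unicode input passes through unchanged; a byte str is decoded
+            # according to the encoding argument (utf-8 by default)
+            self.assertEqual(to_unicode(self.u_entree), self.u_entree)
+            self.assertEqual(to_unicode(self.utf8_entree), self.u_entree)
+            self.assertEqual(
+                to_unicode(self.latin1_entree, encoding='latin-1'),
+                self.u_entree)
+
+    if __name__ == '__main__':
+        unittest.main()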
+
+Creating the release
+====================
+
+1. Make sure that any feature branches you want have been merged.
+2. Pull in new translations and verify they are valid::
+     tx pull -a
+     # If msgfmt is installed, this will check that the catalogs are valid
+     ./releaseutils.py
+     bzr commit -m 'Merge new translations from transifex.net'
+3. Update the version in kitchen/__init__.py and NEWS.
+4. Make a fresh clone of the repository::
+     cd $PATH_TO_MY_SHARED_REPO
+     bzr branch bzr://bzr.fedorahosted.org/bzr/kitchen/devel release
+5. Make the source tarball in that directory::
+     cd release
+     python setup.py sdist
+6. Make sure that the source tarball contains all of the files we want in the release::
+     cd ..
+     tar -xzvf release/dist/kitchen*tar.gz
+     diff -uNr devel kitchen-$RELEASE_VERSION
+7. Upload the docs to pypi::
+     cd release
+     python setup.py upload_docs
+8. Upload the tarball to pypi::
+     python setup.py sdist upload --sign
+9. Upload the tarball to fedorahosted::
+     scp dist/kitchen*tar.gz fedorahosted.org:/srv/web/releases/k/i/kitchen/
+10. Tag the release::
+     cd ../devel
+     bzr tag $RELEASE_VERSION
+     bzr push
diff --git a/NEWS b/NEWS
new file mode 100644
index 0000000..a917d2b
--- /dev/null
+++ b/NEWS
@@ -0,0 +1,170 @@
+====
+NEWS
+====
+
+:Authors: Toshio Kuratomi
+:Date: 14 Feb 2012
+:Version: 1.1.1
+
+-----
+1.1.1
+-----
+
+* Fix a bug with easy_gettext_setup() and get_translation_object() when using
+  the default value of localedirs.
+
+-----
+1.1.0
+-----
+
+* Add yum.i18n.exception2msg section to the porting docs
+* Deprecate BYTE_EXCEPTION_CONVERTERS, as a simplification of the code lets
+  us use EXCEPTION_CONVERTERS for both exception_to_unicode and
+  exception_to_bytes.
+* kitchen.i18n.get_translation_object
+  - Add more parameters to :func:`~kitchen.i18n.get_translation_object` so it
+    can more easily be used as a replacement for :func:`gettext.translation`.
+  - Change the way we use localedirs.  We cycle through them until we find a
+    suitable locale file rather than simply cycling through until we find a
+    directory that exists.
+  - When multiple message catalogs are found in localedirs (and via environment
+    variables), set up the extra ones as fallbacks if the message isn't found
+    in the first catalog.
+* Change the return values from the gettext and lgettext families of functions.
+  Instead of simply guaranteeing a byte str will be returned we now guarantee
+  the byte str will be valid in a certain encoding (the str may still be
+  mangled but it will be valid).
+* Updated subprocess and base64 modules from latest python-2.7 branch.
+* Fix i18n Translation objects to set input_charset and output_charset on any
+  fallback objects.
+* Fix kitchen.i18n Translation objects' output_encoding() method on python-2.3.
+  It was accessing a different self object than we wanted it to.  Defining it
+  in a different way makes it work on python-2.3.
+
+-----
+1.0.0
+-----
+
+* Add a pointer to ordereddict and iterutils in the docs
+* Change a few pieces of code to not internally mix bytes and unicode
+
+-----
+0.2.4
+-----
+
+* Have easy_gettext_setup return lgettext functions instead of gettext
+  functions when use_unicode=False
+* Correct docstring for kitchen.text.converters.exception_to_bytes() -- we're
+  transforming into a byte str, not into unicode.
+* Correct some examples in the unicode frustrations documentation
+* Correct some cross-references in the documentation
+
+-----
+0.2.3
+-----
+
+* Expose MAXFD, list2cmdline(), and mswindows in kitchen.pycompat27.subprocess.
+  These are undocumented, and not in upstream's __all__ but google (and bug
+  reports against kitchen) show that some people are using them.  Note that
+  upstream is leaning towards these being private so they may be deprecated in
+  the python3 subprocess.
+
+-----
+0.2.2
+-----
+
+* Add kitchen.text.converters.exception_to_bytes() and
+  kitchen.text.converters.exception_to_unicode() that take an exception object
+  and convert it into a text representation.
+* Add a documentation section on how the API can be simplified if you can
+  limit your encodings
+
+If all goes well, we'll be making a 1.0 release shortly, which is basically
+this release.
+
+-------
+0.2.2a1
+-------
+
+* Fix exception messages that contain unicode characters
+* Speed up to_unicode for the common cases of utf-8 and latin-1.
+* kitchen.i18n.NewGNUTranslations object that always returns unicode for
+  ugettext and ungettext, always returns str for the other gettext functions,
+  and doesn't throw UnicodeError.
+* Change i18n functions to return either DummyTranslations or
+  NewGNUTranslations so all strings returned are known to be unicode or str.
+* kitchen.pycompat24.base64 now synced from upstream python so it implements
+  all of the python-2.4 API
+* unittest NewGNUTranslations
+* unittest that easy_gettext_setup returns the correct objects
+* Document kitchen.text.display
+* Proofread all of the documentation.  Cross reference to the stdlib.
+* Write a porting guide for people porting from python-fedora and yum APIs.
+
+-------
+0.2.1a1
+-------
+
+* Fix failing unittest on python-2.7
+* Add iterutils module
+* Update table of combining utf8 characters from python-2.7
+* Speed up kitchen.text.misc.str_eq().
+* docs:
+  - api-i18n
+  - api-exceptions
+  - api-collections
+  - api-iterutils
+  - Add two tutorial sections for unicode
+* unittests
+  - kitchen.text.converters.getwriter()
+  - kitchen.iterutils
+  - tests for more input variations to str_eq
+
+-----
+0.2a2
+-----
+* Add unittests for kitchen.text.display, update kitchen.text.utf8 and
+  kitchen.text.misc test coverage
+* Bug fixes for python-2.3
+* Some doc updates.  More to come.
+* New function kitchen.text.converters.getwriter()
+
+-----
+0.2a1
+-----
+* Relicense to LGPLv2+
+* All API versions for subpackages moved to 1.0 to comply with new guidelines
+  on hacking subpackages.
+* Documentation on hacking kitchen and addons
+* Kitchen.text API changed (new API version 1.0)
+  * Move utils.* to misc.*
+  * Deprecate kitchen.text.utf8.utf8_valid in favor of
+    kitchen.text.misc.byte_string_valid_encoding
+    - byte_string_valid_encoding is significantly faster and a bit more generic
+  * Port utf8 functions to use unicode
+  * Put the unicode versions of the utf8 functions into kitchen.text.display
+
+-----
+0.1a3
+-----
+* Add a defaultdict implementation for pycompat25
+* Add documentation
+* Add a StrictDict class that never has str and unicode keys collide.
+
+-----
+0.1a2
+-----
+* Fixes for python-2.3
+* versioning subpackage with version_tuple_to_string() function that creates
+  PEP-386 compatible version strings.
+* Changed pycompat24.builtinset -- now you need to call the add_builtin_set()
+  function to add set and frozenset to the __builtin__ namespace.
+* pycompat24.base64modern module that implements the modern interface to
+  encode and decode base64.  Note that it doesn't implement b32 or b16 at the
+  moment.
+* pycompat27 with the 2.7 version of subprocess.
+* The 2.7 version of subprocess is also available at
+ kitchen.pycompat24.subprocess since subprocess first appeared in python2.4
+
+-----
+0.1a1
+-----
+* Initial release of kitchen.core
diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..3e95af9
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,39 @@
+Metadata-Version: 1.0
+Name: kitchen
+Version: 1.1.1
+Summary: Kitchen contains a cornucopia of useful code
+Home-page: https://fedorahosted.org/kitchen
+Author: Toshio Kuratomi
+Author-email: toshio@fedoraproject.org
+License: LGPLv2+
+Download-URL: https://fedorahosted.org/releases/k/i/kitchen
+Description:
+        We've all done it.  In the process of writing a brand new application we've
+        discovered that we need a little bit of code that we've invented before.
+        Perhaps it's something to handle unicode text.  Perhaps it's something to make
+        a bit of python-2.5 code run on python-2.3.  Whatever it is, it ends up being
+        a tiny bit of code that seems too small to worry about pushing into its own
+        module so it sits there, a part of your current project, waiting to be cut and
+        pasted into your next project.  And the next.  And the next.  And since that
+        little bittybit of code proved so useful to you, it's highly likely that it
+        proved useful to someone else as well.  Useful enough that they've written it
+        and copy and pasted it over and over into each of their new projects.
+
+        Well, no longer!  Kitchen aims to pull these small snippets of code into a few
+        python modules which you can import and use within your project.  No more copy
+        and paste!  Now you can let someone else maintain and release these small
+        snippets so that you can get on with your life.
+
+Keywords: Useful Small Code Snippets
+Platform: UNKNOWN
+Classifier: Development Status :: 4 - Beta
+Classifier: License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 2.3
+Classifier: Programming Language :: Python :: 2.4
+Classifier: Programming Language :: Python :: 2.5
+Classifier: Programming Language :: Python :: 2.6
+Classifier: Programming Language :: Python :: 2.7
+Classifier: Topic :: Software Development :: Internationalization
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Text Processing :: General
diff --git a/README b/README
new file mode 100644
index 0000000..fc59df2
--- /dev/null
+++ b/README
@@ -0,0 +1,81 @@
+===================
+Kitchen.core Module
+===================
+
+:Author: Toshio Kuratomi
+:Date: 2 Jan 2012
+:Version: 1.1.x
+
+The Kitchen module provides a python API for all sorts of little useful
+snippets of code that everybody ends up writing for their projects but never
+seem big enough to build an independent release.  Use kitchen and stop cutting
+and pasting that code over and over.
+
+.. contents::
+
+-------
+License
+-------
+
+Since version 0.2a1, this python module has been distributed under the terms of
+the GNU Lesser General Public License Version 2 or later.
+
+.. note:: Some parts of this module are licensed under terms less restrictive
+   than the LGPL.  If you separate these files from the work as a whole you
+   are allowed to use them under the less restrictive licenses.
The following
+   is a list of the files that are known:
+
+   :subprocess.py: licensed under the Python 2 license by the PSF
+       http://www.python.org/download/releases/2.4/license/
+   :test_subprocess.py: Python Software Foundation License Version 2
+       http://www.python.org/download/releases/2.7/license/
+   :kitchen/pycompat25/defaultdict.py: Python Software Foundation License Version 2
+       http://www.python.org/download/releases/2.6.2/license
+
+------------
+Requirements
+------------
+
+kitchen.core requires
+
+:python: 2.3.1 or later
+
+Soft Requirements
+=================
+
+If found, these libraries will be used to make the implementation of something
+better in some way.  If they are not present, the API that they enable will
+still exist but may function in a different manner.
+
+:chardet_: Used in kitchen.text.xml.guess_encoding_to_xml() to help guess encoding of
+   byte strings being converted.  If not present, unknown encodings will be
+   converted as if they were latin1.
+
+.. _chardet: http://chardet.feedparser.org/
+
+---------------------------
+Other Recommended Libraries
+---------------------------
+
+These libraries implement commonly used functionality that everyone seems to
+invent.  Rather than reinvent their wheel, I simply list the things that they
+do well for now.  Perhaps if people can't find them normally, I'll add them as
+requirements in setup.py or link them into kitchen's namespace.  For now, I
+just mention them here:
+
+:bunch_: Bunch is a dictionary that you can access via attribute lookup as
+   well as bracket notation.  Setting it apart from most homebrewed
+   implementations is the bunchify() function which will descend nested
+   structures of lists and dicts, transforming the dicts into Bunch objects.
+
+.. _bunch: http://pypi.python.org/pypi/bunch/
+
+--------------------
+Building and testing
+--------------------
+
+Testing
+=======
+
+You can run the unittests with this command::
+   nosetests --with-coverage --cover-package kitchen
diff --git a/docs/api-collections.rst b/docs/api-collections.rst
new file mode 100644
index 0000000..30e0a20
--- /dev/null
+++ b/docs/api-collections.rst
@@ -0,0 +1,6 @@
+===================
+Kitchen.collections
+===================
+
+.. automodule:: kitchen.collections.strictdict
+   :members:
diff --git a/docs/api-exceptions.rst b/docs/api-exceptions.rst
new file mode 100644
index 0000000..ebc4bec
--- /dev/null
+++ b/docs/api-exceptions.rst
@@ -0,0 +1,12 @@
+==========
+Exceptions
+==========
+
+Kitchen has a hierarchy of exceptions that should make it easy to catch many
+errors emitted by kitchen itself.
+
+.. automodule:: kitchen.exceptions
+   :members:
+
+.. automodule:: kitchen.text.exceptions
+   :members:
diff --git a/docs/api-i18n.rst b/docs/api-i18n.rst
new file mode 100644
index 0000000..dc6fb4f
--- /dev/null
+++ b/docs/api-i18n.rst
@@ -0,0 +1,38 @@
+===================
+Kitchen.i18n Module
+===================
+
+.. automodule:: kitchen.i18n
+
+Functions
+=========
+
+:func:`easy_gettext_setup` should satisfy the needs of most users.
+:func:`get_translation_object` is designed to ease the way for anyone that
+needs more control.
+
+.. autofunction:: easy_gettext_setup
+
+.. autofunction:: get_translation_object
+
+Translation Objects
+===================
+
+The standard translation objects from the :mod:`gettext` module suffer from
+several problems:
+
+* They can throw :exc:`UnicodeError`
+* They can't find translations for non-:term:`ASCII` byte :class:`str`
+  messages
+* They may return either a :class:`unicode` string or byte :class:`str` from
+  the same function even though the functions say they will only return
+  :class:`unicode` or only return byte :class:`str`.
+
+:class:`DummyTranslations` and :class:`NewGNUTranslations` were written to fix
+these issues.
+
+.. autoclass:: kitchen.i18n.DummyTranslations
+   :members:
+
+.. autoclass:: kitchen.i18n.NewGNUTranslations
+   :members:
diff --git a/docs/api-iterutils.rst b/docs/api-iterutils.rst
new file mode 100644
index 0000000..2bc785c
--- /dev/null
+++ b/docs/api-iterutils.rst
@@ -0,0 +1,9 @@
+
+========================
+Kitchen.iterutils Module
+========================
+
+.. automodule:: kitchen.iterutils
+
+.. autofunction:: kitchen.iterutils.isiterable
+.. autofunction:: kitchen.iterutils.iterate
diff --git a/docs/api-overview.rst b/docs/api-overview.rst
new file mode 100644
index 0000000..dda56fe
--- /dev/null
+++ b/docs/api-overview.rst
@@ -0,0 +1,24 @@
+.. _KitchenAPI:
+
+===========
+Kitchen API
+===========
+
+Kitchen is structured as a collection of modules.  In its current
+configuration, Kitchen ships with the following modules.  Other addon modules
+that may drag in more dependencies can be found on the `project webpage`_.
+
+.. toctree::
+   :maxdepth: 2
+
+   api-i18n
+   api-text
+   api-collections
+   api-iterutils
+   api-versioning
+   api-pycompat24
+   api-pycompat25
+   api-pycompat27
+   api-exceptions
+
+.. _`project webpage`: https://fedorahosted.org/kitchen
diff --git a/docs/api-pycompat24.rst b/docs/api-pycompat24.rst
new file mode 100644
index 0000000..a3247b6
--- /dev/null
+++ b/docs/api-pycompat24.rst
@@ -0,0 +1,34 @@
+========================
+Python 2.4 Compatibility
+========================
+
+
+-------------------
+Sets for python-2.3
+-------------------
+
+.. automodule:: kitchen.pycompat24.sets
+.. autofunction:: kitchen.pycompat24.sets.add_builtin_set
+
+----------------------------------
+Partial new style base64 interface
+----------------------------------
+
+.. automodule:: kitchen.pycompat24.base64
+   :members:
+
+----------
+Subprocess
+----------
+
+.. seealso::
+
+   :mod:`kitchen.pycompat27.subprocess`
+       Kitchen includes the python-2.7 version of subprocess which has a new
+       function, :func:`~kitchen.pycompat27.subprocess.check_output`.  When
+       you import :mod:`pycompat24.subprocess` you will be getting the
+       python-2.7 version of subprocess rather than the 2.4 version (where
+       subprocess first appeared).  This choice was made so that we can
+       concentrate our efforts on keeping the single version of subprocess up
+       to date rather than working on a 2.4 version that very few people
+       would need specifically.
diff --git a/docs/api-pycompat25.rst b/docs/api-pycompat25.rst
new file mode 100644
index 0000000..1841c6a
--- /dev/null
+++ b/docs/api-pycompat25.rst
@@ -0,0 +1,8 @@
+========================
+Python 2.5 Compatibility
+========================
+
+.. automodule:: kitchen.pycompat25
+
+.. automodule:: kitchen.pycompat25.collections._defaultdict
+
diff --git a/docs/api-pycompat27.rst b/docs/api-pycompat27.rst
new file mode 100644
index 0000000..6ef6db1
--- /dev/null
+++ b/docs/api-pycompat27.rst
@@ -0,0 +1,35 @@
+========================
+Python 2.7 Compatibility
+========================
+
+.. module:: kitchen.pycompat27.subprocess
+
+--------------------------
+Subprocess from Python 2.7
+--------------------------
+
+The :mod:`subprocess` module included here is a direct import from
+python-2.7's |stdlib|_.  You can access it via::
+
+    >>> from kitchen.pycompat27 import subprocess
+
+The motivation for including this module is that various API changing
+improvements have been made to subprocess over time.  The following is a list
+of the known changes to :mod:`subprocess` with the python version they were
+introduced in:
+
+==================================== ===
+New API Feature                      Ver
+==================================== ===
+:exc:`subprocess.CalledProcessError` 2.5
+:func:`subprocess.check_call`        2.5
+:func:`subprocess.check_output`      2.7
+:meth:`subprocess.Popen.send_signal` 2.6
+:meth:`subprocess.Popen.terminate`   2.6
+:meth:`subprocess.Popen.kill`        2.6
+==================================== ===
+
+.. seealso::
+
+   The stdlib :mod:`subprocess` documentation
+       For complete documentation on how to use subprocess
diff --git a/docs/api-text-converters.rst b/docs/api-text-converters.rst
new file mode 100644
index 0000000..2542ce2
--- /dev/null
+++ b/docs/api-text-converters.rst
@@ -0,0 +1,405 @@
+-----------------------
+Kitchen.text.converters
+-----------------------
+
+.. automodule:: kitchen.text.converters
+
+Byte Strings and Unicode in Python2
+===================================
+
+Python2 has two string types, :class:`str` and :class:`unicode`.
+:class:`unicode` represents an abstract sequence of text characters.  It can
+hold any character that is present in the unicode standard.  :class:`str` can
+hold any byte of data.  The operating system and python work together to
+display these bytes as characters in many cases but you should always keep in
+mind that the information is really a sequence of bytes, not a sequence of
+characters.  In python2 these types are interchangeable much of the time.
+They are one of the few pairs of types that automatically convert when used
+in equality::
+
+    >>> # string is converted to unicode and then compared
+    >>> "I am a string" == u"I am a string"
+    True
+    >>> # Other types, like int, don't have this special treatment
+    >>> 5 == "5"
+    False
+
+However, this automatic conversion tends to lull people into a false sense of
+security.  As long as you're dealing with :term:`ASCII` characters the
+automatic conversion will save you from seeing any differences.  Once you
+start using characters that are not in :term:`ASCII`, you will start getting
+:exc:`UnicodeError` and :exc:`UnicodeWarning` as the automatic conversions
+between the types fail::
+
+    >>> "I am an ñ" == u"I am an ñ"
+    __main__:1: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
+    False
+
+Why do these conversions fail?  The reason is that the python2
+:class:`unicode` type represents an abstract sequence of unicode text known as
+:term:`code points`.  :class:`str`, on the other hand, really represents
+a sequence of bytes.
Those bytes are converted by your operating system to
+appear as characters on your screen using a particular encoding (usually
+with a default defined by the operating system and customizable by the
+individual user.)  Although :term:`ASCII` characters are fairly standard in
+what bytes represent each character, the bytes outside of the :term:`ASCII`
+range are not.  In general, each encoding will map a different character to
+a particular byte.  Newer encodings map individual characters to multiple
+bytes (which the older encodings will instead treat as multiple characters).
+In the face of these differences, python refuses to guess at an encoding and
+instead issues a warning or exception and refuses to convert.
+
+.. seealso::
+    :ref:`overcoming-frustration`
+        For a longer introduction on this subject.
+
+Strategy for Explicit Conversion
+================================
+
+So what is the best method of dealing with this weltering babble of incoherent
+encodings?  The basic strategy is to explicitly turn everything into
+:class:`unicode` when it first enters your program.  Then, when you send it to
+output, you can transform the unicode back into bytes.  Doing this allows you
+to control the encodings that are used and avoid getting tracebacks due to
+:exc:`UnicodeError`.  Using the functions defined in this module, that looks
+something like this:
+
+.. code-block:: pycon
+    :linenos:
+
+    >>> from kitchen.text.converters import to_unicode, to_bytes
+    >>> name = raw_input('Enter your name: ')
+    Enter your name: Toshio くらとみ
+    >>> name
+    'Toshio \xe3\x81\x8f\xe3\x82\x89\xe3\x81\xa8\xe3\x81\xbf'
+    >>> type(name)
+    <type 'str'>
+    >>> unicode_name = to_unicode(name)
+    >>> type(unicode_name)
+    <type 'unicode'>
+    >>> unicode_name
+    u'Toshio \u304f\u3089\u3068\u307f'
+    >>> # Do a lot of other things before needing to save/output again:
+    >>> output = open('datafile', 'w')
+    >>> output.write(to_bytes(u'Name: %s\n' % unicode_name))
+
+A few notes:
+
+Looking at line 6, you'll notice that the input we took from the user was
+a byte :class:`str`.  In general, anytime we're getting a value from outside
+of python (the filesystem, reading data from the network, interacting with an
+external command, reading values from the environment) we are interacting with
+something that will want to give us a byte :class:`str`.  Some |stdlib|_
+modules and third party libraries will automatically attempt to convert a byte
+:class:`str` to :class:`unicode` strings for you.  This is both a boon and
+a curse.  If the library can guess correctly about the encoding that the data
+is in, it will return :class:`unicode` objects to you without you having to
+convert.  However, if it can't guess correctly, you may end up with one of
+several problems:
+
+:exc:`UnicodeError`
+    The library attempted to decode a byte :class:`str` into
+    a :class:`unicode` string, failed, and raised an exception.
+Garbled data
+    If the library returns the data after decoding it with the wrong encoding,
+    the characters you see in the :class:`unicode` string won't be the ones
+    that you expect.
+A byte :class:`str` instead of :class:`unicode` string
+    Some libraries will return a :class:`unicode` string when they're able to
+    decode the data and a byte :class:`str` when they can't.  This is
+    generally the hardest problem to debug when it occurs.  Avoid it in your
+    own code and try to avoid or open bugs against upstreams that do this.  See
+    :ref:`DesigningUnicodeAwareAPIs` for strategies to do this properly.
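+
+A defensive way to cope with a library that might hand you either type is to
+normalize its return value at the boundary, so that the rest of your code
+only ever sees :class:`unicode`.  This is a minimal sketch using
+:func:`~kitchen.text.converters.to_unicode`; ``some_library.get_title()`` is
+a hypothetical call standing in for any such library::
+
+    from kitchen.text.converters import to_unicode
+
+    # Whatever type the library handed back, only unicode enters our code
+    title = to_unicode(some_library.get_title(), encoding='utf-8',
+                       errors='replace')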
+
+On line 8, we convert from a byte :class:`str` to a :class:`unicode` string.
+:func:`~kitchen.text.converters.to_unicode` does this for us.  It has some
+error handling and sane defaults that make this a nicer function to use than
+calling :meth:`str.decode` directly:
+
+* Instead of defaulting to the :term:`ASCII` encoding which fails with all
+  but the simple American English characters, it defaults to :term:`UTF-8`.
+* Instead of raising an error if it cannot decode a value, it will replace
+  the value with the unicode "Replacement character" symbol (``�``).
+* If you happen to call this method with something that is not a :class:`str`
+  or :class:`unicode`, it will return an empty :class:`unicode` string.
+
+All three of these can be overridden using different keyword arguments to the
+function.  See the :func:`to_unicode` documentation for more information.
+
+On line 15 we push the data back out to a file.  Two things you should note here:
+
+1. We deal with the strings as :class:`unicode` until the last instant.  The
+   string format that we're using is :class:`unicode` and the variable also
+   holds :class:`unicode`.  People sometimes get into trouble when they mix
+   a byte :class:`str` format with a variable that holds a :class:`unicode`
+   string (or vice versa) at this stage.
+2. :func:`~kitchen.text.converters.to_bytes` does the reverse of
+   :func:`to_unicode`.  In this case, we're using the default values which
+   turn :class:`unicode` into a byte :class:`str` using :term:`UTF-8`.  Any
+   errors are replaced with a ``�`` and sending nonstring objects yields
+   empty strings.  Just like :func:`to_unicode`, you can look at the
+   documentation for :func:`to_bytes` to find out how to override any of
+   these defaults.
+
+When to use an alternate strategy
+---------------------------------
+
+The default strategy of decoding to :class:`unicode` strings when you take
+data in and encoding to a byte :class:`str` when you send the data back out
+works great for most problems but there are a few times when you shouldn't:
+
+* The values aren't meant to be read as text
+* The values need to be byte-for-byte when you send them back out -- for
+  instance if they are database keys or filenames.
+* You are transferring the data between several libraries that all expect
+  byte :class:`str`.
+
+In each of these instances, there is a reason to keep around the byte
+:class:`str` version of a value.  Here are a few hints to keep your sanity in
+these situations:
+
+1. Keep your :class:`unicode` and :class:`str` values separate.  Just like the
+   pain caused when you have to use someone else's library that returns both
+   :class:`unicode` and :class:`str`, you can cause yourself pain if you have
+   functions that can return both types or variables that could hold either
+   type of value.
+2. Name your variables so that you can tell whether you're storing byte
+   :class:`str` or :class:`unicode` strings.  One of the first things you end
+   up having to do when debugging is determine what type of string you have in
+   a variable and what type of string you are expecting.  Naming your
+   variables consistently so that you can tell which type they are supposed to
+   hold will save you from at least one of those steps.
+3. When you get values initially, make sure that you're dealing with the type
+   of value that you expect as you save it.  You can use :func:`isinstance`
+   or :func:`to_bytes` since :func:`to_bytes` doesn't do any modifications of
+   the string if it's already a :class:`str`.
When using :func:`to_bytes`
+   for this purpose you might want to use::
+
+       try:
+           b_input = to_bytes(input_should_be_bytes_already, errors='strict', nonstring='strict')
+       except (UnicodeError, TypeError):
+           handle_errors_somehow()
+
+   The reason is that the default of :func:`to_bytes` will take characters
+   that are illegal in the chosen encoding and transform them to replacement
+   characters.  Since the point of keeping this data as a byte :class:`str` is
+   to keep the exact same bytes when you send it outside of your code,
+   changing things to replacement characters should be raising red flags that
+   something is wrong.  Setting :attr:`errors` to ``strict`` will raise an
+   exception, which gives you an opportunity to fail gracefully.
+4. Sometimes you will want to print out the values that you have in your byte
+   :class:`str`.  When you do this you will need to make sure that you
+   transform :class:`unicode` to :class:`str` before combining them.  Also be
+   sure that any other function calls (including :mod:`gettext`) are going to
+   give you strings that are the same type.  For instance::
+
+       print to_bytes(_('Username: %(user)s'), 'utf-8') % {'user': b_username}
+
+Gotchas and how to avoid them
+=============================
+
+Even when you have a good conceptual understanding of how python2 treats
+:class:`unicode` and :class:`str` there are still some things that can
+surprise you.  In most cases this is because, as noted earlier, python or one
+of the python libraries you depend on is trying to convert a value
+automatically and failing.  Explicit conversion at the appropriate place
+usually solves that.
+
+str(obj)
+--------
+
+One common idiom for getting a simple string representation of an object is to use::
+
+    str(obj)
+
+Unfortunately, this is not safe.  Sometimes str(obj) will return
+:class:`unicode`.  Sometimes it will return a byte :class:`str`.  Sometimes,
+it will attempt to convert from a :class:`unicode` string to a byte
+:class:`str`, fail, and throw a :exc:`UnicodeError`.  To be safe from all of
+these, first decide whether you need :class:`unicode` or :class:`str` to be
+returned.  Then use :func:`to_unicode` or :func:`to_bytes` to get the simple
+representation like this::
+
+    u_representation = to_unicode(obj, nonstring='simplerepr')
+    b_representation = to_bytes(obj, nonstring='simplerepr')
+
+print
+-----
+
+python has a builtin :func:`print` statement that outputs strings to the
+terminal.  This originated in a time when python only dealt with byte
+:class:`str`.  When :class:`unicode` strings came about, some enhancements
+were made to the :func:`print` statement so that it could print those as well.
+The enhancements make :func:`print` work most of the time.  However, the times
+when it doesn't work tend to make for cryptic debugging.
+
+The basic issue is that :func:`print` has to figure out what encoding to use
+when it prints a :class:`unicode` string to the terminal.  When python is
+attached to your terminal (i.e., you're running the interpreter or running
+a script that prints to the screen) python is able to take the encoding value
+from your locale settings :envvar:`LC_ALL` or :envvar:`LC_CTYPE` and print the
+characters allowed by that encoding.  On most modern Unix systems, the
+encoding is :term:`utf-8` which means that you can print any :class:`unicode`
+character without problem.
+
+There are two common cases of things going wrong:
+
+1. Someone has a locale set that does not accept all valid unicode characters.
   For instance::
+
+       $ LC_ALL=C python
+       >>> print u'\ufffd'
+       Traceback (most recent call last):
+         File "<stdin>", line 1, in <module>
+       UnicodeEncodeError: 'ascii' codec can't encode character u'\ufffd' in position 0: ordinal not in range(128)
+
+   This often happens when a script that you've written and debugged from the
+   terminal is run from an automated environment like :program:`cron`.  It
+   also occurs when you have written a script using a :term:`utf-8` aware
+   locale and released it for consumption by people all over the internet.
+   Inevitably, someone is running with a locale that can't handle all unicode
+   characters and you get a traceback reported.
+2. You redirect output to a file.  Python isn't using the values in
+   :envvar:`LC_ALL` unconditionally to decide what encoding to use.  Instead
+   it is using the encoding set for the terminal you are printing to, which is
+   set to accept different encodings by :envvar:`LC_ALL`.  If you redirect
+   to a file, you are no longer printing to the terminal so :envvar:`LC_ALL`
+   won't have any effect.  At this point, python will decide it can't find an
+   encoding and fall back to :term:`ASCII`, which will likely lead to
+   :exc:`UnicodeError` being raised.  You can see this in a short script::
+
+       #! /usr/bin/python -tt
+       print u'\ufffd'
+
+   And then look at the difference between running it normally and redirecting to a file:
+
+   .. code-block:: console
+
+       $ ./test.py
+       �
+       $ ./test.py > t
+       Traceback (most recent call last):
+         File "test.py", line 3, in <module>
+           print u'\ufffd'
+       UnicodeEncodeError: 'ascii' codec can't encode character u'\ufffd' in position 0: ordinal not in range(128)
+
+The short answer to dealing with this is to always use bytes when writing
+output.  You can do this by explicitly converting to bytes like this::
+
+    from kitchen.text.converters import to_bytes
+    u_string = u'\ufffd'
+    print to_bytes(u_string)
+
+or you can wrap stdout and stderr with a :class:`~codecs.StreamWriter`.
+A :class:`~codecs.StreamWriter` is convenient in that you can assign it to
+encode for :data:`sys.stdout` or :data:`sys.stderr` and then have output
+automatically converted, but it has the drawback of still being able to throw
+:exc:`UnicodeError` if the writer can't encode all possible unicode
+codepoints.  Kitchen provides an alternate version which can be retrieved with
+:func:`kitchen.text.converters.getwriter` which will not traceback in its
+standard configuration.
+
+.. _unicode-and-dict-keys:
+
+Unicode, str, and dict keys
+---------------------------
+
+The :func:`hash` of the :term:`ASCII` characters is the same for
+:class:`unicode` and byte :class:`str`.
When you use them in :class:`dict`
+keys, they evaluate to the same dictionary slot::
+
+    >>> u_string = u'a'
+    >>> b_string = 'a'
+    >>> hash(u_string), hash(b_string)
+    (12416037344, 12416037344)
+    >>> d = {}
+    >>> d[u_string] = 'unicode'
+    >>> d[b_string] = 'bytes'
+    >>> d
+    {u'a': 'bytes'}
+
+When you deal with key values outside of :term:`ASCII`, :class:`unicode` and
+byte :class:`str` evaluate unequally no matter what their character content or
+hash value::
+
+    >>> u_string = u'ñ'
+    >>> b_string = u_string.encode('utf-8')
+    >>> print u_string
+    ñ
+    >>> print b_string
+    ñ
+    >>> d = {}
+    >>> d[u_string] = 'unicode'
+    >>> d[b_string] = 'bytes'
+    >>> d
+    {u'\xf1': 'unicode', '\xc3\xb1': 'bytes'}
+    >>> b_string2 = '\xf1'
+    >>> hash(u_string), hash(b_string2)
+    (30848092528, 30848092528)
+    >>> d = {}
+    >>> d[u_string] = 'unicode'
+    >>> d[b_string2] = 'bytes'
+    >>> d
+    {u'\xf1': 'unicode', '\xf1': 'bytes'}
+
+How do you work with this one?  Remember rule #1:  Keep your :class:`unicode`
+and byte :class:`str` values separate.  That goes for keys in a dictionary
+just like anything else.
+
+* For any given dictionary, make sure that all your keys are either
+  :class:`unicode` or :class:`str`.  **Do not mix the two.**  If you're being
+  given both :class:`unicode` and :class:`str` but you don't need to preserve
+  separate keys for each, I recommend using :func:`to_unicode` or
+  :func:`to_bytes` to convert all keys to one type or the other like this::
+
+      >>> from kitchen.text.converters import to_unicode
+      >>> u_string = u'one'
+      >>> b_string = 'two'
+      >>> d = {}
+      >>> d[to_unicode(u_string)] = 1
+      >>> d[to_unicode(b_string)] = 2
+      >>> d
+      {u'two': 2, u'one': 1}
+
+* These issues also apply to using dicts with tuple keys that contain
+  a mixture of :class:`unicode` and :class:`str`.  Once again the best fix
+  is to standardise on either :class:`str` or :class:`unicode`.
+
+* If you absolutely need to store values in a dictionary where the keys could
+  be either :class:`unicode` or :class:`str` you can use
+  :class:`~kitchen.collections.strictdict.StrictDict` which has separate
+  entries for all :class:`unicode` and byte :class:`str` and deals correctly
+  with any :class:`tuple` containing mixed :class:`unicode` and byte
+  :class:`str`.
+
+---------
+Functions
+---------
+
+Unicode and byte str conversion
+===============================
+
+.. autofunction:: kitchen.text.converters.to_unicode
+.. autofunction:: kitchen.text.converters.to_bytes
+.. autofunction:: kitchen.text.converters.getwriter
+.. autofunction:: kitchen.text.converters.to_str
+.. autofunction:: kitchen.text.converters.to_utf8
+
+Transformation to XML
+=====================
+
+.. autofunction:: kitchen.text.converters.unicode_to_xml
+.. autofunction:: kitchen.text.converters.xml_to_unicode
+.. autofunction:: kitchen.text.converters.byte_string_to_xml
+.. autofunction:: kitchen.text.converters.xml_to_byte_string
+.. autofunction:: kitchen.text.converters.bytes_to_xml
+.. autofunction:: kitchen.text.converters.xml_to_bytes
+.. autofunction:: kitchen.text.converters.guess_encoding_to_xml
+.. autofunction:: kitchen.text.converters.to_xml
+
+Working with exception messages
+===============================
+
+.. autodata:: kitchen.text.converters.EXCEPTION_CONVERTERS
+.. autodata:: kitchen.text.converters.BYTE_EXCEPTION_CONVERTERS
+.. autofunction:: kitchen.text.converters.exception_to_unicode
+..
autofunction:: kitchen.text.converters.exception_to_bytes diff --git a/docs/api-text-display.rst b/docs/api-text-display.rst new file mode 100644 index 0000000..f15c7f7 --- /dev/null +++ b/docs/api-text-display.rst @@ -0,0 +1,33 @@ +.. automodule:: kitchen.text.display + +.. autofunction:: kitchen.text.display.textual_width + +.. autofunction:: kitchen.text.display.textual_width_chop + +.. autofunction:: kitchen.text.display.textual_width_fill + +.. autofunction:: kitchen.text.display.wrap + +.. autofunction:: kitchen.text.display.fill + +.. autofunction:: kitchen.text.display.byte_string_textual_width_fill + +Internal Data +============= + +There are a few internal functions and variables in this module. Code outside +of kitchen shouldn't use them but people coding on kitchen itself may find +them useful. + +.. autodata:: kitchen.text.display._COMBINING + +.. autofunction:: kitchen.text.display._generate_combining_table + +.. autofunction:: kitchen.text.display._print_combining_table + +.. autofunction:: kitchen.text.display._interval_bisearch + +.. autofunction:: kitchen.text.display._ucp_width + +.. autofunction:: kitchen.text.display._textual_width_le + diff --git a/docs/api-text-misc.rst b/docs/api-text-misc.rst new file mode 100644 index 0000000..94cd4f2 --- /dev/null +++ b/docs/api-text-misc.rst @@ -0,0 +1,2 @@ +.. automodule:: kitchen.text.misc + :members: diff --git a/docs/api-text-utf8.rst b/docs/api-text-utf8.rst new file mode 100644 index 0000000..576d189 --- /dev/null +++ b/docs/api-text-utf8.rst @@ -0,0 +1,3 @@ +.. automodule:: kitchen.text.utf8 + :members: + :deprecated: diff --git a/docs/api-text.rst b/docs/api-text.rst new file mode 100644 index 0000000..7b6051a --- /dev/null +++ b/docs/api-text.rst @@ -0,0 +1,22 @@ +============================================= +Kitchen.text: unicode and utf8 and xml oh my! +============================================= + +The kitchen.text module contains functions that deal with text manipulation. + +.. toctree:: + + api-text-converters + api-text-display + api-text-misc + api-text-utf8 + +:mod:`~kitchen.text.converters` + deals with converting text for different encodings and to and from XML +:mod:`~kitchen.text.display` + deals with issues with printing text to a screen +:mod:`~kitchen.text.misc` + is a catchall for text manipulation functions that don't seem to fit + elsewhere +:mod:`~kitchen.text.utf8` + contains deprecated functions to manipulate utf8 byte strings diff --git a/docs/api-versioning.rst b/docs/api-versioning.rst new file mode 100644 index 0000000..4100bbb --- /dev/null +++ b/docs/api-versioning.rst @@ -0,0 +1,6 @@ +=============================== +Helpers for versioning software +=============================== + +.. automodule:: kitchen.versioning + :members: diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..5885d48 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,220 @@ +# -*- coding: utf-8 -*- +# +# Kitchen documentation build configuration file, created by +# sphinx-quickstart on Sat May 22 00:51:26 2010. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. 
+
+import sys, os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+import kitchen.release
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.append(os.path.abspath('.'))
+
+# -- General configuration -----------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.pngmath', 'sphinx.ext.ifconfig']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = kitchen.release.NAME
+copyright = kitchen.release.COPYRIGHT
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '0.2'
+# The full version, including alpha/beta/rc tags.
+release = kitchen.__version__
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+language = 'en'
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of documents that shouldn't be included in the build.
+#unused_docs = []
+
+# List of directories, relative to source directory, that shouldn't be searched
+# for source files.
+exclude_trees = []
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+show_authors = True
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+highlight_language = 'python'
+
+# -- Options for HTML output ---------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  Major themes that come with
+# Sphinx are currently 'default' and 'sphinxdoc'.
+html_theme = 'default'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Content template for the index page.
+html_index = 'index.html'
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_use_modindex = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+html_use_opensearch = kitchen.release.DOWNLOAD_URL + 'docs/'
+
+# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = ''
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'kitchendoc'
+
+
+# -- Options for LaTeX output --------------------------------------------------
+
+# The paper size ('letter' or 'a4').
+#latex_paper_size = 'letter'
+
+# The font size ('10pt', '11pt' or '12pt').
+#latex_font_size = '10pt'
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+  ('index', 'kitchen.tex', u'kitchen Documentation',
+   u'Toshio Kuratomi', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# Additional stuff for the LaTeX preamble.
+#latex_preamble = ''
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_use_modindex = True
+
+automodule_skip_lines = 4
+autoclass_content = "class"
+
+# Example configuration for intersphinx: refer to the Python standard library.
+intersphinx_mapping = {'http://docs.python.org/': None,
+                       'https://fedorahosted.org/releases/p/y/python-fedora/doc/': None,
+                       'https://fedorahosted.org/releases/p/a/packagedb/doc/': None}
+
+rst_epilog = '''
+.. |projpage| replace:: project webpage
+.. _projpage: %(url)s
+.. |docpage| replace:: documentation page
+.. _docpage: %(download)s/docs
+.. |downldpage| replace:: download page
+.. _downldpage: %(download)s
+.. |stdlib| replace:: python standard library
+.. _stdlib: http://docs.python.org/library
+''' % {'url': kitchen.release.URL, 'download': kitchen.release.DOWNLOAD_URL}
diff --git a/docs/designing-unicode-apis.rst b/docs/designing-unicode-apis.rst
new file mode 100644
index 0000000..24265fb
--- /dev/null
+++ b/docs/designing-unicode-apis.rst
@@ -0,0 +1,690 @@
+.. _DesigningUnicodeAwareAPIs:
+
+============================
+Designing Unicode Aware APIs
+============================
+
+APIs that deal with byte :class:`str` and :class:`unicode` strings are
+difficult to get right.  Here are a few strategies with pros and cons of each.
+
+.. contents::
+
+-------------------------------------------------
+Take either bytes or unicode, output only unicode
+-------------------------------------------------
+
+In this strategy, you allow the user to enter either :class:`unicode` strings
+or byte :class:`str` but what you give back is always :class:`unicode`.  This
+strategy is easy for novice end users to start using immediately as they will
+be able to feed either type of string into the function and get back a string
+that they can use in other places.
+
+However, it does lead to the novice writing code that functions correctly when
+testing it with :term:`ASCII`-only data but fails when given data that contains
+non-:term:`ASCII` characters.  Worse, if your API is not designed to be
+flexible, the consumer of your code won't be able to easily correct those
+problems once they find them.
+
+Here's a good API that uses this strategy::
+
+    from kitchen.text.converters import to_unicode
+
+    def truncate(msg, max_length, encoding='utf8', errors='replace'):
+        msg = to_unicode(msg, encoding, errors)
+        return msg[:max_length]
+
+The call to :func:`truncate` starts with the essential parameters for
+performing the task.  It ends with two optional keyword arguments that define
+the encoding to use to transform from a byte :class:`str` to :class:`unicode`
+and the strategy to use if undecodable bytes are encountered.  The defaults
+may vary depending on the use cases you have in mind.  When the output is
+generally going to be printed for the user to see, ``errors='replace'`` is
+a good default.  If you are constructing keys to a database, raising an
+exception (with ``errors='strict'``) may be a better default.  In either case,
+having both parameters allows the person using your API to choose how they
+want to handle any problems.  The presence of these parameters is also a clue
+to them that a conversion from byte :class:`str` to :class:`unicode` string is
+going to occur.
+
+.. note::
+
+    If you're targeting python-3.1 and above, ``errors='surrogateescape'`` may
+    be a better default than ``errors='strict'``.  You need to be mindful of
+    a few things when using ``surrogateescape`` though:
+
+    * ``surrogateescape`` will cause issues if a non-:term:`ASCII` compatible
+      encoding is used (for instance, UTF-16 and UTF-32).  That makes it
+      unhelpful in situations where a true general purpose method of encoding
+      must be found.  :pep:`383` mentions that ``surrogateescape`` was
+      specifically designed with the limitations of translating using system
+      locales (where :term:`ASCII` compatibility is generally seen as
+      inescapable) so you should keep that in mind.
+    * If you use ``surrogateescape`` to decode from :class:`bytes`
+      to :class:`unicode` you will need to use an error handler other than
+      ``strict`` when you encode, as the lone surrogates that this error
+      handler creates make for invalid unicode that must be handled when
+      encoding.
In Python-3.1.2 or less, a bug in the encoder error handlers means that
+      you can only use ``surrogateescape`` to encode; anything else will throw
+      an error.
+
+    Evaluate your usages of the variables in question to see what makes sense.
+
+Here's a bad example of using this strategy::
+
+    from kitchen.text.converters import to_unicode
+
+    def truncate(msg, max_length):
+        msg = to_unicode(msg)
+        return msg[:max_length]
+
+In this example, we don't have the optional keyword arguments for
+:attr:`encoding` and :attr:`errors`.  A user who uses this function is more
+likely to miss the fact that a conversion from byte :class:`str` to
+:class:`unicode` is going to occur.  And once an error is reported, they will
+have to look through their backtrace and think harder about where they want to
+transform their data into :class:`unicode` strings instead of having the
+opportunity to control how the conversion takes place in the function itself.
+Note that the user does have the ability to make this work by making the
+transformation to unicode themselves::
+
+    from kitchen.text.converters import to_unicode
+
+    msg = to_unicode(msg, encoding='euc_jp', errors='ignore')
+    new_msg = truncate(msg, 5)
+
+--------------------------------------------------
+Take either bytes or unicode, output the same type
+--------------------------------------------------
+
+This strategy is sometimes called polymorphic because the type of data that is
+returned is dependent on the type of data that is received.  The concept is
+that when you are given a byte :class:`str` to process, you return a byte
+:class:`str` in your output.  When you are given :class:`unicode` strings to
+process, you return :class:`unicode` strings in your output.
+
+This can work well for end users as the ones that know about the difference
+between the two string types will already have transformed the strings to
+their desired type before giving it to this function.  The ones that don't can
+remain blissfully ignorant (at least, as far as your function is concerned) as
+the function does not change the type.
+
+In cases where the encoding of the byte :class:`str` is known or can be
+discovered based on the input data this works well.  If you can't figure out
+the input encoding, however, this strategy can fail in any of the following
+cases:
+
+1. It needs to do an internal conversion between byte :class:`str` and
+   :class:`unicode` string.
+2. It cannot return the same data as either a :class:`unicode` string or byte
+   :class:`str`.
+3. You may need to deal with byte strings that are not byte-compatible with
+   :term:`ASCII`
+
+First, a couple examples of using this strategy in a good way::
+
+    def translate(msg, table):
+        replacements = table.keys()
+        new_msg = []
+        for char in msg:
+            if char in replacements:
+                new_msg.append(table[char])
+            else:
+                new_msg.append(char)
+
+        return ''.join(new_msg)
+
+In this example, all of the strings that we use (except the empty string which
+is okay because it doesn't have any characters to encode) come from outside of
+the function.  Due to that, the user is responsible for making sure that
+:attr:`msg` and the keys and values in :attr:`table` all match in terms of
+type (:class:`unicode` vs :class:`str`) and encoding (you can do some error
+checking to make sure the user gave all the same type, but you can't do the
+same for the user giving different encodings).
You do not need to make
+changes to the string that require you to know the encoding or type of the
+string; everything is a simple replacement of one element in the array of
+characters in message with the character in table.
+
+::
+
+    import json
+    from kitchen.text.converters import to_unicode, to_bytes
+
+    def first_field_from_json_data(json_string):
+        '''Return the first field in a json data structure.
+
+        The format of the json data is a simple list of strings.
+        '["one", "two", "three"]'
+        '''
+        if isinstance(json_string, unicode):
+            # On all python versions, json.loads() returns unicode if given
+            # a unicode string
+            return json.loads(json_string)[0]
+
+        # Byte str: figure out which encoding we're dealing with
+        if '\x00' not in json_string[:2]:
+            encoding = 'utf8'
+        elif '\x00\x00\x00' == json_string[:3]:
+            encoding = 'utf-32-be'
+        elif '\x00\x00\x00' == json_string[1:4]:
+            encoding = 'utf-32-le'
+        elif '\x00' == json_string[0] and '\x00' == json_string[2]:
+            encoding = 'utf-16-be'
+        else:
+            encoding = 'utf-16-le'
+
+        data = json.loads(unicode(json_string, encoding))
+        return data[0].encode(encoding)
+
+In this example the function takes either a byte :class:`str` type or
+a :class:`unicode` string that has a list in json format and returns the first
+field from it as the type of the input string.  The first section of code is
+very straightforward; we receive a :class:`unicode` string, parse it with
+a function, and then return the first field from our parsed data (which our
+function returned to us as json data).
+
+The second portion that deals with byte :class:`str` is not so
+straightforward.  Before we can parse the string we have to determine what
+characters the bytes in the string map to.  If we didn't do that, we wouldn't
+be able to properly find which characters are present in the string.  In order
+to do that we have to figure out the encoding of the byte :class:`str`.
+Luckily, the json specification states that all strings are unicode and
+encoded with one of UTF32be, UTF32le, UTF16be, UTF16le, or :term:`UTF-8`.  It further
+defines the format such that the first two characters are always
+:term:`ASCII`.  Each of these encodings has a different sequence of NULLs when
+it encodes an :term:`ASCII` character.  We can use that to detect which
+encoding was used to create the byte :class:`str`.
+
+Finally, we return the byte :class:`str` by encoding the :class:`unicode` back
+to a byte :class:`str`.
+
+As you can see, in this example we have to convert from byte :class:`str` to
+:class:`unicode` and back.  But we know from the json specification that byte
+:class:`str` has to be one of a limited number of encodings that we are able
+to detect.  That ability makes this strategy work.
+
+Now for some examples of using this strategy in ways that fail::
+
+    import unicodedata
+    def first_char(msg):
+        '''Return the first character in a string'''
+        if not isinstance(msg, unicode):
+            try:
+                msg = unicode(msg, 'utf8')
+            except UnicodeError:
+                msg = unicode(msg, 'latin1')
+        msg = unicodedata.normalize('NFC', msg)
+        return msg[0]
+
+If you look at that code and think that there's something fragile and prone to
+breaking in the ``try: except:`` block you are correct in being suspicious.
+This code will fail on multi-byte character sets that aren't :term:`UTF-8`.  It
+can also fail on data where the sequence of bytes is valid :term:`UTF-8` but
+the bytes are actually of a different encoding.
The reason this code fails
+is that we don't know what encoding the bytes are in and the code must convert
+from a byte :class:`str` to a :class:`unicode` string in order to function.
+
+In order to make this code robust we must know the encoding of :attr:`msg`.
+The only way to know that is to ask the user so the API must do that::
+
+    import unicodedata
+    def first_char(msg, encoding='utf8', errors='strict'):
+        '''Return the first character in a string'''
+        if not isinstance(msg, unicode):
+            msg = unicode(msg, encoding, errors)
+        msg = unicodedata.normalize('NFC', msg)
+        return msg[0]
+
+Another example of failure::
+
+    import os
+    def listdir(directory):
+        files = os.listdir(directory)
+        if isinstance(directory, str):
+            return files
+        # files could contain both bytes and unicode
+        new_files = []
+        for filename in files:
+            if not isinstance(filename, unicode):
+                # What to do here?
+                continue
+            new_files.append(filename)
+        return new_files
+
+This function illustrates the second failure mode.  Here, not all of the
+possible values can be represented as :class:`unicode` without knowing more
+about the encoding of each of the filenames involved.  Since each filename
+could have a different encoding there are a few different options to pursue.  We
+could make this function always return byte :class:`str` since that can
+accurately represent anything that could be returned.  If we want to return
+:class:`unicode` we need to at least allow the user to specify what to do in
+case of an error decoding the bytes to :class:`unicode`.  We can also let the
+user specify the encoding to use for doing the decoding but that won't help in
+all cases since not all files will be in the same encoding (or even
+necessarily in any encoding)::
+
+    import locale
+    import os
+    def listdir(directory, encoding=locale.getpreferredencoding(), errors='strict'):
+        # Note: In python-3.1+, surrogateescape may be a better default
+        files = os.listdir(directory)
+        if isinstance(directory, str):
+            return files
+        new_files = []
+        for filename in files:
+            if not isinstance(filename, unicode):
+                filename = unicode(filename, encoding=encoding, errors=errors)
+            new_files.append(filename)
+        return new_files
+
+Note that although we use :attr:`errors` in this example as what to pass to
+the codec that decodes to :class:`unicode` we could also have an
+:attr:`errors` argument that decides other things to do, like skipping
+a filename entirely, returning a placeholder (``Nondisplayable filename``), or
+raising an exception.
+
+This leaves us with one last failure to describe::
+
+    def first_field(csv_string):
+        '''Return the first field in a comma separated values string.'''
+        try:
+            return csv_string[:csv_string.index(',')]
+        except ValueError:
+            return csv_string
+
+This code looks simple enough.  The hidden error here is that we are searching
+for a comma character in a byte :class:`str` but not all encodings will use
+the same sequence of bytes to represent the comma.  If you use an encoding
+that's not :term:`ASCII` compatible on the byte level, then the literal comma
+``','`` in the above code will match inappropriate bytes.  Some examples of
+how it can fail:
+
+* Will find the byte representing an :term:`ASCII` comma in another character
+* Will find the comma but leave trailing garbage bytes on the end of the
+  string
+* Will not match the character that represents the comma in this encoding
+
+There are two ways to solve this.  You can either take the encoding value from
+the user or you can take the separator value from the user.
Of the two,
+taking the encoding is the better option for two reasons:
+
+1. Taking a separator argument doesn't clearly document for the API user that
+   the reason they must give it is to properly match the encoding of the
+   :attr:`csv_string`.  They're just as likely to think that it's simply a way
+   to specify an alternate character (like ":" or "|") for the separator.
+2. It's possible for a variable width encoding to reuse the same byte sequence
+   for different characters in multiple sequences.
+
+   .. note::
+
+       :term:`UTF-8` is resistant to this as any character's sequence of
+       bytes will never be a subset of another character's sequence of bytes.
+
+With that in mind, here's how to improve the API::
+
+    def first_field(csv_string, encoding='utf-8', errors='replace'):
+        if not isinstance(csv_string, unicode):
+            u_string = unicode(csv_string, encoding, errors)
+            is_unicode = False
+        else:
+            u_string = csv_string
+            is_unicode = True
+
+        try:
+            field = u_string[:u_string.index(u',')]
+        except ValueError:
+            return csv_string
+
+        if not is_unicode:
+            field = field.encode(encoding, errors)
+        return field
+
+.. note::
+
+    If you decide you'll never encounter a variable width encoding that reuses
+    byte sequences you can use this code instead::
+
+        def first_field(csv_string, encoding='utf-8'):
+            try:
+                return csv_string[:csv_string.index(','.encode(encoding))]
+            except ValueError:
+                return csv_string
+
+------------------
+Separate functions
+------------------
+
+Sometimes you want to be able to take either byte :class:`str` or
+:class:`unicode` strings, perform similar operations on either one and then
+return data in the same format as was given.  Probably the easiest way to do
+that is to have separate functions for each and adopt a naming convention to
+show that one is for working with byte :class:`str` and the other is for
+working with :class:`unicode` strings::
+
+    def translate_b(msg, table):
+        '''Replace values in str with other byte values like unicode.translate'''
+        if not isinstance(msg, str):
+            raise TypeError('msg must be of type str')
+        str_table = [chr(s) for s in xrange(0, 256)]
+        delete_chars = []
+        for chr_val in (k for k in table.keys() if isinstance(k, int)):
+            if chr_val > 255:
+                raise ValueError('Keys in table must not exceed 255')
+            if table[chr_val] is None:
+                delete_chars.append(chr(chr_val))
+            elif isinstance(table[chr_val], int):
+                if table[chr_val] > 255 or table[chr_val] < 0:
+                    raise TypeError('table values cannot be more than 255 or less than 0')
+                str_table[chr_val] = chr(table[chr_val])
+            else:
+                if not isinstance(table[chr_val], str):
+                    raise TypeError('character mapping must return integer, None or str')
+                str_table[chr_val] = table[chr_val]
+        str_table = ''.join(str_table)
+        delete_chars = ''.join(delete_chars)
+        return msg.translate(str_table, delete_chars)
+
+    def translate(msg, table):
+        '''Replace values in a unicode string with other values'''
+        if not isinstance(msg, unicode):
+            raise TypeError('msg must be of type unicode')
+        return msg.translate(table)
+
+There are several things that we have to do in this API:
+
+* Because the function names might not be enough of a clue for users about
+  which value types are expected, we have to check that the types are correct.
+
+* We keep the behaviour of the two functions as close to the same as possible,
+  just with byte :class:`str` and :class:`unicode` strings substituted for
+  each other.
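+
+For illustration, here is how the pair above might behave in a hypothetical
+interactive session (the mapping replaces ``a`` with ``A`` and deletes
+``c``)::
+
+    >>> translate_b('abc', {ord('a'): ord('A'), ord('c'): None})
+    'Ab'
+    >>> translate(u'abc', {ord(u'a'): u'A', ord(u'c'): None})
+    u'Ab'
+
+Keeping the two call signatures identical, as here, makes it easy for users
+to switch between the byte and unicode versions as their data dictates.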
+
+
+-----------------------------------------------------------------
+Deciding whether to take str or unicode when no value is returned
+-----------------------------------------------------------------
+
+Not all functions have a return value. Sometimes a function exists to
+interact with something external to python (for instance, writing a file out
+to disk) or to update the internal state of a data structure.
+One of the main questions with these APIs is whether to take byte
+:class:`str`, :class:`unicode` strings, or both. The answer depends on your
+use case but I'll give some examples here.
+
+Writing to external data
+========================
+
+When your information is going to an external data source like writing to
+a file you need to decide whether to take in :class:`unicode` strings or byte
+:class:`str`. Remember that most external data sources are not going to be
+dealing with unicode directly. Instead, they're going to be dealing with
+a sequence of bytes that may be interpreted as unicode. With that in mind,
+you either need to have the user give you a byte :class:`str` or convert to
+a byte :class:`str` inside the function.
+
+Next you need to think about the type of data that you're receiving. If it's
+textual data (for instance, this is a chat client and the user is typing
+messages that they expect to be read by another person) it probably makes
+sense to take in :class:`unicode` strings and do the conversion inside your
+function. On the other hand, if this is a lower level function that's passing
+data into a network socket, it probably should be taking byte :class:`str`
+instead.
+
+As noted in the API notes above, you should specify an :attr:`encoding`
+and :attr:`errors` argument if you need to transform from :class:`unicode`
+string to byte :class:`str` and you are unable to guess the encoding from the
+data itself.
+
+Updating data structures
+========================
+
+Sometimes your API is just going to update a data structure and not
+immediately output that data anywhere. Just as when writing external data,
+you should think about both what your function is going to do with the data
+eventually and what the caller of your function is thinking that they're
+giving you. Most of the time, you'll want to take :class:`unicode` strings
+and enter them into the data structure as :class:`unicode` when the data is
+textual in nature. You'll want to take byte :class:`str` and enter them into
+the data structure as byte :class:`str` when the data is not text. Use
+a naming convention so the user knows what's expected.
+
+-------------
+APIs to Avoid
+-------------
+
+There are a few APIs that are just wrong. If you catch yourself making an API
+that does one of these things, change it before anyone sees your code.
+
+Returning unicode unless a conversion fails
+===========================================
+
+This type of API usually deals with byte :class:`str` at some point and
+converts it to :class:`unicode` because it is expected to be text.
+However, there are times when the bytes fail to convert to a :class:`unicode`
+string. When that happens, this API returns the raw byte :class:`str` instead
+of a :class:`unicode` string.
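+
+A minimal sketch of this anti-pattern (the function name here is hypothetical)
+looks like this::
+
+    def from_bytes(msg, encoding='utf8'):
+        # Anti-pattern: the type of the return value depends on the
+        # content of msg, not on anything the caller can control
+        try:
+            return unicode(msg, encoding)
+        except UnicodeDecodeError:
+            return msg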
One example of this is present in the |stdlib|_:
+python2's :func:`os.listdir`::
+
+    >>> import os
+    >>> import locale
+    >>> locale.getpreferredencoding()
+    'UTF-8'
+    >>> os.mkdir('/tmp/mine')
+    >>> os.chdir('/tmp/mine')
+    >>> open('nonsense_char_\xff', 'w').close()
+    >>> open('all_ascii', 'w').close()
+    >>> os.listdir(u'.')
+    [u'all_ascii', 'nonsense_char_\xff']
+
+The problem with APIs like this is that they cause failures that are hard to
+debug because they don't happen where the variables are set. For instance,
+let's say you take the filenames from :func:`os.listdir` and give it to this
+function::
+
+    def normalize_filename(filename):
+        '''Change spaces and dashes into underscores'''
+        return filename.translate({ord(u' '): u'_', ord(u'-'): u'_'})
+
+When you test this, you use filenames that all are decodable in your preferred
+encoding and everything seems to work. But when this code is run on a machine
+that has filenames in multiple encodings the filenames returned by
+:func:`os.listdir` suddenly include byte :class:`str`. And byte :class:`str`
+has a different :meth:`str.translate` method that takes different values.
+So the code raises an exception where it's not immediately obvious that
+:func:`os.listdir` is at fault.
+
+Ignoring values with no chance of recovery
+==========================================
+
+An early version of python3 attempted to fix the :func:`os.listdir` problem
+pointed out in the last section by returning all values that were decodable to
+:class:`unicode` and omitting the filenames that were not. This led to the
+following output::
+
+    >>> import os
+    >>> import locale
+    >>> locale.getpreferredencoding()
+    'UTF-8'
+    >>> os.mkdir('/tmp/mine')
+    >>> os.chdir('/tmp/mine')
+    >>> open(b'nonsense_char_\xff', 'w').close()
+    >>> open('all_ascii', 'w').close()
+    >>> os.listdir('.')
+    ['all_ascii']
+
+The issue with this type of code is that it is silently doing something
+surprising. The caller expects to get a full list of files back from
+:func:`os.listdir`. Instead, it silently ignores some of the files, returning
+only a subset. This leads to code that doesn't do what is expected and may
+go unnoticed until the code is in production and someone notices that
+something important is being missed.
+
+Raising a UnicodeException with no chance of recovery
+=====================================================
+
+Believe it or not, a few libraries exist that make it impossible to deal
+with unicode text without raising a :exc:`UnicodeError`. What seems to occur
+in these libraries is that the library has functions that expect to receive
+a :class:`unicode` string. However, internally, those functions call other
+functions that expect to receive a byte :class:`str`. The programmer of the
+API was smart enough to convert from a :class:`unicode` string to a byte
+:class:`str` but they did not give the user the chance to specify the
+encodings to use or how to deal with errors. This results in exceptions when
+the user passes in a byte :class:`str` because the initial function wants
+a :class:`unicode` string and exceptions when the user passes in
+a :class:`unicode` string because the function can't convert the string to
+bytes in the encoding that it has selected.
+
+Do not put the user in the position of not being able to use your API without
+raising a :exc:`UnicodeError` with certain values. If you can only safely
+take :class:`unicode` strings, document that byte :class:`str` is not allowed
+and vice versa.
If you have to convert internally, make sure to give the
+caller of your function parameters to control the encoding and how to treat
+errors that may occur during the encoding/decoding process. If your code will
+raise a :exc:`UnicodeError` with non-:term:`ASCII` values no matter what, you
+should probably rethink your API.
+
+-----------------
+Knowing your data
+-----------------
+
+If you've read all the way down to this section without skipping you've seen
+several admonitions about the type of data you are processing affecting the
+viability of the various API choices.
+
+Here are a few things to consider about your data:
+
+Do you need to operate on both bytes and unicode?
+=================================================
+
+Much of the data in libraries, programs, and the general environment outside
+of python treats strings as sequences of bytes. So when we
+interact with data that comes from outside of python or data that is about to
+leave python it may make sense to only operate on the data as a byte
+:class:`str`. There are two times when this may make sense:
+
+1. The user is intended to hand the data to the function and then the function
+   takes care of sending the data outside of python (to the filesystem, over
+   the network, etc).
+2. The data is not representable as text. For instance, writing a binary
+   file format.
+
+Even when your code is operating in this area you still need to think a little
+more about your data. For instance, it might make sense for the person using
+your API to pass in :class:`unicode` strings and let the function convert that
+into the byte :class:`str` that it then sends over the wire.
+
+There are also times when it might make sense to operate only on
+:class:`unicode` strings. :class:`unicode` represents text so anytime that
+you are working on textual data that isn't going to leave python it has the
+potential to be a :class:`unicode`-only API. However, there are two things
+that you should consider when designing a :class:`unicode`-only API:
+
+1. As your API gains popularity, people are going to use your API in places
+   that you may not have thought of. Corner cases in these other places may
+   mean that processing bytes is desirable.
+2. In python2, byte :class:`str` and :class:`unicode` are often used
+   interchangeably with each other. That means that people programming against
+   your API may have received :class:`str` from some other API and it would be
+   most convenient for their code if your API accepted it.
+
+.. note::
+
+    In python3, the separation between the text type and the byte type
+    is clearer. So in python3, there's less need to have all APIs take
+    both unicode and bytes.
+
+Can you restrict the encodings?
+===============================
+
+If you determine that you have to deal with byte :class:`str` you should
+realize that not all encodings are created equal. Each has different
+properties that may make it possible to provide a simpler API provided that
+you can reasonably tell the users of your API that they cannot use certain
+classes of encodings.
+
+As one example, if you are required to find a comma (``,``) in a byte
+:class:`str` you have different choices based on what encodings are allowed.
+If you can reasonably restrict your API users to only giving :term:`ASCII
+compatible` encodings you can do this simply by searching for the literal
+comma character because that character will be represented by the same byte
+sequence in all :term:`ASCII compatible` encodings.
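+
+A quick check in the interpreter shows why this works for :term:`ASCII
+compatible` encodings but not for others (a sketch; ``utf_16_le`` stands in
+for any encoding that is not :term:`ASCII compatible`)::
+
+    >>> u','.encode('utf8')
+    ','
+    >>> u','.encode('euc_jp')
+    ','
+    >>> u','.encode('utf_16_le')
+    ',\x00'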
+
+
+The following are some classes of encodings to be aware of as you decide how
+generic your code needs to be.
+
+Single byte encodings
+---------------------
+
+Single byte encodings can only represent 256 total characters. They encode
+each character's :term:`code point` as the equivalent number in a single
+byte.
+
+Most single byte encodings are :term:`ASCII compatible`. :term:`ASCII
+compatible` encodings are the most likely to be usable without changes to code
+so this is good news. A notable exception to this is the EBCDIC
+family of encodings.
+
+Multibyte encodings
+-------------------
+
+Multibyte encodings use more than one byte to encode some characters.
+
+Fixed width
+~~~~~~~~~~~
+
+Fixed width encodings have a set number of bytes to represent all of the
+characters in the character set. ``UTF-32`` is an example of a fixed width
+encoding that uses four bytes per character and can express every unicode
+character. There are a number of problems with writing APIs that need to
+operate on fixed width, multibyte characters. To go back to our earlier
+example of finding a comma in a string, we have to realize that even in
+``UTF-32`` where the :term:`code point` for :term:`ASCII` characters is the
+same as in :term:`ASCII`, the byte sequence for them is different. So you
+cannot search for the literal :term:`ASCII` byte, as doing so may pick up
+false positives and may break a byte sequence in an odd place.
+
+Variable Width
+~~~~~~~~~~~~~~
+
+ASCII compatible
+""""""""""""""""
+
+:term:`UTF-8` and the EUC
+family of encodings are examples of :term:`ASCII compatible` multi-byte
+encodings. They achieve this by adhering to two principles:
+
+* All of the :term:`ASCII` characters are represented by the byte that they
+  are in the :term:`ASCII` encoding.
+* None of the :term:`ASCII` byte sequences are reused in any other byte
+  sequence for a different character.
+
+Escaped
+"""""""
+
+Some multibyte encodings work by using only bytes from the :term:`ASCII`
+encoding but when a particular sequence of those bytes is found, they are
+interpreted as meaning something other than their :term:`ASCII` values.
+``UTF-7`` is one such encoding that can encode all of the unicode
+:term:`code points`. For instance, here are some Japanese characters encoded
+as ``UTF-7``::
+
+    >>> a = u'\u304f\u3089\u3068\u307f'
+    >>> print a
+    くらとみ
+    >>> print a.encode('utf-7')
+    +ME8wiTBoMH8-
+
+These encodings can be used when you need to encode unicode data that may
+contain non-:term:`ASCII` characters for inclusion in an :term:`ASCII` only
+transport medium or file.
+
+However, they are not :term:`ASCII compatible` in the sense that we used
+earlier as the bytes that represent an :term:`ASCII` character are being
+reused as part of other characters. If you were to search for a literal plus
+sign in this encoded string, you would run across many false positives, for
+instance.
+
+Other
+"""""
+
+There are many other popular variable width encodings, for instance ``UTF-16``
+and ``shift-JIS``. Many of these are not :term:`ASCII compatible` so you
+cannot search for a literal :term:`ASCII` character without danger of false
+positives or false negatives.
diff --git a/docs/glossary.rst b/docs/glossary.rst
new file mode 100644
index 0000000..89451d4
--- /dev/null
+++ b/docs/glossary.rst
@@ -0,0 +1,107 @@
+========
+Glossary
+========
+
+.. glossary::
+
+    "Everything but the kitchen sink"
+        An English idiom meaning to include nearly everything that you can
+        think of.
+
+
+    API version
+        Version that is meant for computer consumption. This version is
+        parsable and comparable by computers. It contains information about
+        a library's API so that computer software can decide whether it works
+        with the software.
+
+    ASCII
+        A character encoding that maps numbers to characters essential to
+        American English. It maps 128 characters using 7 bits.
+
+        .. seealso:: http://en.wikipedia.org/wiki/ASCII
+
+    ASCII compatible
+        An encoding in which the particular byte that maps to a character in
+        the :term:`ASCII` character set is only used to map to that character.
+        This excludes EBCDIC-based encodings and many multi-byte fixed and
+        variable width encodings since they reuse the bytes that make up the
+        :term:`ASCII` encoding for other purposes. :term:`UTF-8` is notable
+        as a variable width encoding that is :term:`ASCII` compatible.
+
+        .. seealso::
+
+            http://en.wikipedia.org/wiki/Variable-width_encoding
+                For another explanation of various ways bytes are mapped to
+                characters in a possibly incompatible manner.
+
+    code points
+        :term:`code point`
+
+    code point
+        A number that maps to a particular abstract character. Code points
+        make it so that we have a number pointing to a character without
+        worrying about implementation details of how those numbers are stored
+        for the computer to read. Encodings define how the code points map to
+        particular sequences of bytes on disk and in memory.
+
+    control characters
+        :term:`control character`
+
+    control character
+        The set of characters in unicode that are used, not to display glyphs
+        on the screen, but to tell the displaying program to do something.
+
+        .. seealso:: http://en.wikipedia.org/wiki/Control_character
+
+    grapheme
+        Characters or pieces of characters that you might write on a page to
+        make words, sentences, or other pieces of text.
+
+        .. seealso:: http://en.wikipedia.org/wiki/Grapheme
+
+    I18N
+        I18N is an abbreviation for internationalization. It's often used to
+        signify the need to translate words, number and date formats, and
+        other pieces of data in a computer program so that it will work well
+        for people who speak a language other than your own.
+
+    message catalogs
+        :term:`message catalog`
+
+    message catalog
+        Message catalogs contain translations for user-visible strings that
+        are present in your code. Normally, you need to mark the strings to
+        be translated by wrapping them in one of several :mod:`gettext`
+        functions. The functions serve two purposes:
+
+        1. They allow automated tools to find which strings are supposed to
+           be extracted for translation.
+        2. They perform the translation when the program is running.
+
+        .. seealso::
+
+            babel's documentation
+                for one method of extracting message catalogs from source
+                code.
+
+    Murphy's Law
+        "Anything that can go wrong, will go wrong."
+
+        .. seealso:: http://en.wikipedia.org/wiki/Murphy%27s_Law
+
+    release version
+        Version that is meant for human consumption. This version is easy for
+        a human to look at to decide how a particular version relates to other
+        versions of the software.
+
+    textual width
+        The amount of horizontal space a character takes up on a monospaced
+        screen. The units are number of character cells or columns that it
+        takes the place of.
+
+    UTF-8
+        A character encoding that maps all unicode :term:`code points` to
+        a sequence of bytes. It is compatible with :term:`ASCII`. It uses
+        a variable number of bytes to encode all of unicode. ASCII characters
+        take one byte.
Characters from other parts of unicode take two to four bytes.
+        It is widespread as an encoding on the internet and in Linux.
diff --git a/docs/hacking.rst b/docs/hacking.rst
new file mode 100644
index 0000000..9fbad1a
--- /dev/null
+++ b/docs/hacking.rst
@@ -0,0 +1,359 @@
+=======================================
+Conventions for contributing to kitchen
+=======================================
+
+-----
+Style
+-----
+
+* Strive to be :pep:`8` compliant
+* Run :command:`pylint` over the code and try to resolve most of its
+  nitpicking
+
+------------------------
+Python 2.3 compatibility
+------------------------
+
+At the moment, we're supporting python-2.3 and above. Understand that there
+are a lot of python features that we cannot use because of this.
+
+Sometimes modules in the |stdlib|_ can be added to kitchen so that they're
+available. When we do that we need to be careful of several things:
+
+1. Keep the module in sync with the version in the python-2.x trunk. Use
+   :file:`maintainers/sync-copied-files.py` for this.
+2. Sync the unittests as well as the module.
+3. Be aware that not all modules are written to remain compatible with
+   Python-2.3 and might use python language features that were not present
+   then (generator expressions, relative imports, decorators, with, try: with
+   both except: and finally:, etc). These are not good candidates for
+   importing into kitchen as they require more work to keep synced.
+
+---------
+Unittests
+---------
+
+* At least smoketest your code (make sure a function will return expected
+  values for one set of inputs).
+* Note that even 100% coverage is not a guarantee of working code! Good tests
+  also give multiple inputs that exercise the code paths of called functions
+  that are outside of your code. Example::
+
+      def to_unicode(msg, encoding='utf8', errors='replace'):
+          return unicode(msg, encoding, errors)
+
+      # Smoketest only. This will give 100% coverage for your code (it
+      # tests all of the code inside of to_unicode) but it leaves a lot of
+      # room for errors as it doesn't test all combinations of arguments
+      # that are then passed to the unicode() function.
+
+      tools.ok_(to_unicode('abc') == u'abc')
+
+      # Better -- tests now cover non-ascii characters and that error
+      # conditions occur properly. There's a lot of other permutations
+      # that can be added along these same lines.
+      tools.ok_(to_unicode(u'café'.encode('utf8'), 'utf8', 'replace') == u'café')
+      tools.assert_raises(UnicodeError, to_unicode,
+              u'cafè ñunru'.encode('latin1'), 'ascii', 'strict')
+
+* We're using nose for unittesting. Rather than depend on unittest2
+  functionality, use the functions that nose provides.
+* Remember to maintain python-2.3 compatibility even in unittests.
+
+----------------------------
+Docstrings and documentation
+----------------------------
+
+We use sphinx to build our documentation. We use the sphinx autodoc extension
+to pull docstrings out of the modules for API documentation. This means that
+docstrings for subpackages and modules should follow a certain pattern. The
+general structure is:
+
+* Introductory material about a module in the module's top level docstring.
+
+  * Introductory material should begin with a level two title: an overbar and
+    underbar of '-'.
+
+* docstrings for every function.
+
+  * The first line is a short summary of what the function does
+  * This is followed by a blank line
+  * The next lines are a field list giving
+    information about the function's signature.
We use the keywords:
+    ``arg``, ``kwarg``, ``raises``, ``returns``, and sometimes ``rtype``. Use
+    these to describe all arguments, keyword arguments, exceptions raised,
+    and return values.
+
+    * Parameters that are ``kwarg`` should specify what their default
+      behaviour is.
+
+.. _kitchen-versioning:
+
+------------------
+Kitchen versioning
+------------------
+
+Currently the kitchen library is in early stages of development. While we're
+in this state, the main kitchen library uses the following pattern for version
+information:
+
+* Versions look like this::
+
+      __version_info__ = ((0, 1, 2),)
+      __version__ = '0.1.2'
+
+* The Major version number remains at 0 until we decide to make the first 1.0
+  release of kitchen. At that point, we're declaring that we have some
+  confidence that we won't need to break backwards compatibility for a while.
+* The Minor version increments for any backwards incompatible API changes.
+  When this is updated, we reset micro to zero.
+* The Micro version increments for any other changes (backwards compatible API
+  changes, pure bugfixes, etc).
+
+.. note::
+
+    Versioning is only updated for releases that generate sdists and new
+    uploads to the download directory. Usually we update the version
+    information for the library just before release. By contrast, we update
+    kitchen :ref:`subpackage-versioning` when an API change is made. When in
+    doubt, look at the version information in the last release.
+
+----
+I18N
+----
+
+All strings that are used as feedback for users need to be translated.
+:mod:`kitchen` sets up several functions for this. :func:`_` is used for
+marking things that are shown to users via print, GUIs, or other "standard"
+methods. Strings for exceptions are marked with :func:`b_`. This function
+returns a byte :class:`str` which is needed for use with exceptions::
+
+    from kitchen import _, b_
+
+    def print_message(msg, username):
+        print _('%(user)s, your message of the day is: %(message)s') % {
+                'message': msg, 'user': username}
+
+        raise Exception(b_('Test message'))
+
+This serves several purposes:
+
+* It marks the strings to be extracted by an xgettext-like program.
+* :func:`_` is a function that will substitute available translations at
+  runtime.
+
+.. note::
+
+    By using the ``%()s with dict`` style of string formatting, we make this
+    string friendly to translators that may need to reorder the variables when
+    they're translating the string.
+
+paver and babel are used to extract the strings.
+
+-----------
+API updates
+-----------
+
+Kitchen strives to have a long deprecation cycle so that people have time to
+switch away from any APIs that we decide to discard. Discarded APIs should
+raise a :exc:`DeprecationWarning` and clearly state in the warning message and
+the docstring how to convert old code to use the new interface. An example of
+deprecating a function::
+
+    import warnings
+
+    from kitchen import _
+    from kitchen.text.converters import to_bytes, to_unicode
+    from kitchen.text.new_module import new_function
+
+    def old_function(param):
+        '''**Deprecated**
+
+        This function is deprecated. Use
+        :func:`kitchen.text.new_module.new_function` instead.
If you want
+        unicode strings as output, switch to::
+
+            >>> from kitchen.text.new_module import new_function
+            >>> output = new_function(param)
+
+        If you want byte strings, use::
+
+            >>> from kitchen.text.new_module import new_function
+            >>> from kitchen.text.converters import to_bytes
+            >>> output = to_bytes(new_function(param))
+        '''
+        warnings.warn(_('kitchen.text.old_function is deprecated. Use'
+                ' kitchen.text.new_module.new_function instead'),
+                DeprecationWarning, stacklevel=2)
+
+        as_unicode = isinstance(param, unicode)
+        message = new_function(to_unicode(param))
+        if not as_unicode:
+            message = to_bytes(message)
+        return message
+
+If a particular API change is very intrusive, it may be better to create a new
+version of the subpackage and ship both the old version and the new version.
+
+---------
+NEWS file
+---------
+
+Update the :file:`NEWS` file when you make a change that will be visible to
+the users. This is not a ChangeLog file so we don't need to list absolutely
+everything but it should give the user an idea of how this version differs
+from prior versions. API changes should be listed here explicitly. Bugfixes
+can be more general::
+
+    -----
+    0.2.0
+    -----
+    * Relicense to LGPLv2+
+    * Add kitchen.text.format module with the following functions:
+      textual_width, textual_width_chop.
+    * Rename the kitchen.text.utils module to kitchen.text.misc. Use of the
+      old names is deprecated but still available.
+    * Bugfixes applied to kitchen.pycompat24.defaultdict that fix some
+      tracebacks
+
+-------------------
+Kitchen subpackages
+-------------------
+
+Kitchen itself is a namespace. The kitchen sdist (tarball) provides certain
+useful subpackages.
+
+.. seealso::
+
+    `Kitchen addon packages`_
+        For information about subpackages not distributed in the kitchen sdist
+        that install into the kitchen namespace.
+
+.. _subpackage-versioning:
+
+Versioning
+==========
+
+Each subpackage should have its own version information which is independent
+of the other kitchen subpackages and the main kitchen library version. This is
+used so that code that depends on kitchen APIs can check the version
+information. The standard way to do this is to put something like this in the
+subpackage's :file:`__init__.py`::
+
+    from kitchen.versioning import version_tuple_to_string
+
+    __version_info__ = ((1, 0, 0),)
+    __version__ = version_tuple_to_string(__version_info__)
+
+:attr:`__version_info__` is documented in :mod:`kitchen.versioning`. The
+values of the first tuple should describe API changes to the module. There
+are at least three numbers present in the tuple: (Major, minor, micro). The
+major version number is for backwards incompatible changes (for
+instance, removing a function, or adding a new mandatory argument to
+a function). Whenever one of these occurs, you should increment the major
+number and reset minor and micro to zero. The second number is the minor
+version. Anytime new but backwards compatible changes are introduced this
+number should be incremented and the micro version number reset to zero. The
+micro version should be incremented when a change is made that does not change
+the API at all. This is a common case for bugfixes, for instance.
+
+Version information beyond the first three parts of the first tuple may be
+useful for versioning but semantically have similar meaning to the micro
+version.
+
+.. note::
+
+    We update the :attr:`__version_info__` tuple when the API is updated.
This way there's less chance of forgetting to update the API version when
+    a new release is made. However, we try to only increment the version
+    numbers a single step for any release. So if kitchen-0.1.0 has
+    kitchen.text.__version__ == '1.0.1', kitchen-0.1.1 should have
+    kitchen.text.__version__ == '1.0.2' or '1.1.0' or '2.0.0'.
+
+Criteria for subpackages in kitchen
+===================================
+
+Subpackages within kitchen should meet these criteria:
+
+* Generally useful or needed for other pieces of kitchen.
+
+* No mandatory requirements outside of the |stdlib|_.
+
+  * Optional requirements from outside the |stdlib|_ are allowed. Things with
+    mandatory requirements are better placed in `kitchen addon packages`_
+
+* Somewhat API stable -- this is not a hard requirement. We can change the
+  kitchen API. However, it is better not to as people may come to depend on
+  it.
+
+  .. seealso::
+
+      `API Updates`_
+
+----------------------
+Kitchen addon packages
+----------------------
+
+Addon packages are very similar to subpackages integrated into the kitchen
+sdist. This section just lists some of the differences to watch out for.
+
+setup.py
+========
+
+Your :file:`setup.py` should contain entries like this::
+
+    # It's suggested to use a dotted name like this so the package is easily
+    # findable on pypi:
+    setup(name='kitchen.config',
+        # Include kitchen in the keywords, again, for searching on pypi
+        keywords=['kitchen', 'configuration'],
+        # This package lives in the directory kitchen/config
+        packages=['kitchen.config'],
+        # [...]
+        )
+
+Package directory layout
+========================
+
+Create a :file:`kitchen` directory in the toplevel. Place the addon
+subpackage in there. For example::
+
+    ./                   <== toplevel with README, setup.py, NEWS, etc
+    kitchen/
+    kitchen/__init__.py
+    kitchen/config/      <== subpackage directory
+    kitchen/config/__init__.py
+
+Fake kitchen module
+===================
+
+The :file:`__init__.py` in the :file:`kitchen` directory is special. It
+won't be installed. It just needs to pull in the kitchen module from the
+system so that you are able to test your module. You should be able to use
+this boilerplate::
+
+    # Fake module. This is not installed; it's just here to import the real
+    # kitchen modules for testing this module
+    import pkgutil
+
+    # Extend the __path__ with everything in the real kitchen module
+    __path__ = pkgutil.extend_path(__path__, __name__)
+
+.. note::
+
+    :mod:`kitchen` needs to be findable by python for this to work.
+    Installing it in the :file:`site-packages` directory or adding it to the
+    :envvar:`PYTHONPATH` will work.
+
+Your unittests should now be able to find both your submodule and the main
+kitchen module.
+
+Versioning
+==========
+
+It is recommended that addon packages version similarly to
+:ref:`subpackage-versioning`. The :data:`__version_info__` and
+:data:`__version__` strings can be changed independently of the version
+exposed by setup.py so that you have both an API version
+(:data:`__version_info__`) and a release version that's easier for people to
+parse.
However, you aren't required to do this and you could follow
+a different methodology if you want (for instance, :ref:`kitchen-versioning`)
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..e14f0da
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,142 @@
+================================
+Kitchen, everything but the sink
+================================
+
+:Author: Toshio Kuratomi
+:Date: 19 March 2011
+:Version: 1.0.x
+
+We've all done it. In the process of writing a brand new application we've
+discovered that we need a little bit of code that we've invented before.
+Perhaps it's something to handle unicode text. Perhaps it's something to make
+a bit of python-2.5 code run on python-2.3. Whatever it is, it ends up being
+a tiny bit of code that seems too small to worry about pushing into its own
+module so it sits there, a part of your current project, waiting to be cut and
+pasted into your next project. And the next. And the next. And since that
+little bitty bit of code proved so useful to you, it's highly likely that it
+proved useful to someone else as well. Useful enough that they've written it
+and copied and pasted it over and over into each of their new projects.
+
+Well, no longer! Kitchen aims to pull these small snippets of code into a few
+python modules which you can import and use within your project. No more copy
+and paste! Now you can let someone else maintain and release these small
+snippets so that you can get on with your life.
+
+This package forms the core of Kitchen. It contains some useful modules for
+using newer |stdlib|_ modules on older python versions, text manipulation,
+:pep:`386` versioning, and initializing :mod:`gettext`. With this package we're
+trying to provide a few useful features that don't have too many dependencies
+outside of the |stdlib|_. We'll be releasing other modules that drop into the
+kitchen namespace to add other features (possibly with larger deps) as time
+goes on.
+
+------------
+Requirements
+------------
+
+We've tried to keep the core kitchen module's requirements lightweight. At the
+moment kitchen only requires
+
+:python: 2.3.1 or later
+
+.. warning:: Kitchen-1.1.0 is likely to be the last release that supports
+    python-2.3.x. Future releases will target python-2.4 as the minimum
+    required version.
+
+Soft Requirements
+=================
+
+If found, these libraries will be used to make the implementation of some part
+of kitchen better in some way. If they are not present, the API that they
+enable will still exist but may function in a different manner.
+
+chardet
+    Used in :func:`~kitchen.text.misc.guess_encoding` and
+    :func:`~kitchen.text.converters.guess_encoding_to_xml` to help guess
+    encoding of byte strings being converted. If not present, unknown
+    encodings will be converted as if they were ``latin1``
+
+---------------------------
+Other Recommended Libraries
+---------------------------
+
+These libraries implement commonly used functionality that everyone seems to
+invent. Rather than reinvent their wheel, I simply list the things that they
+do well for now. Perhaps if people can't find them normally, I'll add them as
+requirements in :file:`setup.py` or link them into kitchen's namespace. For
+now, I just mention them here:
+
+bunch
+    Bunch is a dictionary that you can access using attribute lookup as well
+    as bracket notation.
Setting it apart from most homebrewed implementations
+    is the :func:`bunchify` function which will descend nested structures of
+    lists and dicts, transforming the dicts into Bunches.
+hashlib
+    Python 2.5 and forward have a :mod:`hashlib` library that provides secure
+    hash functions to python. If you're developing for python2.3 or
+    python2.4, though, you can install the standalone hashlib library and have
+    access to the same functions.
+iterutils
+    The python documentation for :mod:`itertools` has some examples
+    of other nice iterable functions that can be built from the
+    :mod:`itertools` functions. This third-party module provides those
+    recipes as a module.
+ordereddict
+    Python 2.7 and forward have a :mod:`~collections.OrderedDict` that
+    provides a :class:`dict` whose items are ordered (and indexable) as well
+    as named.
+unittest2
+    Python 2.7 has an updated :mod:`unittest` library with new functions not
+    present in the |stdlib|_ for Python 2.6 or less. If you want to use those
+    new functions but need your testing framework to be compatible with older
+    Python the unittest2 library provides the update as an external module.
+nose
+    If you want to use a test discovery tool instead of the unittest
+    framework, nose provides a simple-to-use way to do that.
+
+-------
+License
+-------
+
+This python module is distributed under the terms of the
+GNU Lesser General Public License Version 2 or later.
+
+.. note:: Some parts of this module are licensed under terms less restrictive
+    than the LGPLv2+. If you separate these files from the work as a whole
+    you are allowed to use them under the less restrictive licenses. The
+    following is a list of the files that are known:
+
+    Python 2 license
+        :file:`_subprocess.py`, :file:`test_subprocess.py`,
+        :file:`defaultdict.py`, :file:`test_defaultdict.py`,
+        :file:`_base64.py`, and :file:`test_base64.py`
+
+--------
+Contents
+--------
+
+.. toctree::
+    :maxdepth: 2
+
+    tutorial
+    api-overview
+    porting-guide-0.3
+    hacking
+    glossary
+
+------------------
+Indices and tables
+------------------
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
+-------------
+Project Pages
+-------------
+
+More information about the project can be found on the |projpage|_
+
+The latest published version of this documentation can be found on the
+|docpage|_
diff --git a/docs/porting-guide-0.3.rst b/docs/porting-guide-0.3.rst
new file mode 100644
index 0000000..3378f07
--- /dev/null
+++ b/docs/porting-guide-0.3.rst
@@ -0,0 +1,209 @@
+===================
+1.0.0 Porting Guide
+===================
+
+The 0.1 through 1.0.0 releases focused on bringing in functions from yum and
+python-fedora. This porting guide tells how to port from those APIs to their
+kitchen replacements.
+
+-------------
+python-fedora
+-------------
+
+=================================== ===================
+python-fedora                       kitchen replacement
+----------------------------------- -------------------
+:func:`fedora.iterutils.isiterable` :func:`kitchen.iterutils.isiterable` [#f1]_
+:func:`fedora.textutils.to_unicode` :func:`kitchen.text.converters.to_unicode`
+:func:`fedora.textutils.to_bytes`   :func:`kitchen.text.converters.to_bytes`
+=================================== ===================
+
+.. [#f1] :func:`~kitchen.iterutils.isiterable` has changed slightly in
+    kitchen. The :attr:`include_string` attribute has switched its default
+    value from :data:`True` to :data:`False`.
So you need to change code like::
+
+        >>> # Old code
+        >>> isiterable('abcdef')
+        True
+        >>> # New code
+        >>> isiterable('abcdef', include_string=True)
+        True
+
+---
+yum
+---
+
+================================= ===================
+yum                               kitchen replacement
+--------------------------------- -------------------
+:func:`yum.i18n.dummy_wrapper`    :meth:`kitchen.i18n.DummyTranslations.ugettext` [#y1]_
+:func:`yum.i18n.dummyP_wrapper`   :meth:`kitchen.i18n.DummyTranslations.ungettext` [#y1]_
+:func:`yum.i18n.utf8_width`       :func:`kitchen.text.display.textual_width`
+:func:`yum.i18n.utf8_width_chop`  :func:`kitchen.text.display.textual_width_chop`
+                                  and :func:`kitchen.text.display.textual_width` [#y2]_ [#y4]_
+:func:`yum.i18n.utf8_valid`       :func:`kitchen.text.misc.byte_string_valid_encoding`
+:func:`yum.i18n.utf8_text_wrap`   :func:`kitchen.text.display.wrap` [#y3]_
+:func:`yum.i18n.utf8_text_fill`   :func:`kitchen.text.display.fill` [#y3]_
+:func:`yum.i18n.to_unicode`       :func:`kitchen.text.converters.to_unicode` [#y5]_
+:func:`yum.i18n.to_unicode_maybe` :func:`kitchen.text.converters.to_unicode` [#y5]_
+:func:`yum.i18n.to_utf8`          :func:`kitchen.text.converters.to_bytes` [#y5]_
+:func:`yum.i18n.to_str`           :func:`kitchen.text.converters.to_unicode`
+                                  or :func:`kitchen.text.converters.to_bytes` [#y6]_
+:func:`yum.i18n.str_eq`           :func:`kitchen.text.misc.str_eq`
+:func:`yum.misc.to_xml`           :func:`kitchen.text.converters.unicode_to_xml`
+                                  or :func:`kitchen.text.converters.byte_string_to_xml` [#y7]_
+:func:`yum.i18n._`                See: :ref:`yum-i18n-init`
+:func:`yum.i18n.P_`               See: :ref:`yum-i18n-init`
+:func:`yum.i18n.exception2msg`    :func:`kitchen.text.converters.exception_to_unicode`
+                                  or :func:`kitchen.text.converters.exception_to_bytes` [#y8]_
+================================= ===================
+
+.. [#y1] These yum methods provided fallback support for :mod:`gettext`
+    functions in case either ``gaftonmode`` was set or :mod:`gettext` failed
+    to return an object. In kitchen, we can use the
+    :class:`kitchen.i18n.DummyTranslations` object to fulfill that role.
+    Please see :ref:`yum-i18n-init` for more suggestions on how to do this.
+
+.. [#y2] The yum version of these functions returned a byte :class:`str`. The
+    kitchen version listed here returns a :class:`unicode` string. If you
+    need a byte :class:`str` simply call
+    :func:`kitchen.text.converters.to_bytes` on the result.
+
+.. [#y3] The yum version of these functions would return either a byte
+    :class:`str` or a :class:`unicode` string depending on what the input
+    value was. The kitchen version always returns :class:`unicode` strings.
+
+.. [#y4] :func:`yum.i18n.utf8_width_chop` performed two functions. It
+    returned the piece of the message that fit in a specified width and the
+    width of that message. In kitchen, you need to call two functions, one
+    for each action::
+
+        >>> # Old way
+        >>> utf8_width_chop(msg, 5)
+        (5, 'く ku')
+        >>> # New way
+        >>> from kitchen.text.display import textual_width, textual_width_chop
+        >>> (textual_width(msg), textual_width_chop(msg, 5))
+        (5, u'く ku')
+
+.. [#y5] If the yum version of :func:`~yum.i18n.to_unicode` or
+    :func:`~yum.i18n.to_utf8` is given an object that is not a string, it
+    returns the object itself. :func:`kitchen.text.converters.to_unicode` and
+    :func:`kitchen.text.converters.to_bytes` default to returning the
+    ``simplerepr`` of the object instead.
If you want the yum behaviour, set
+    the :attr:`nonstring` parameter to ``passthru``::
+
+        >>> from kitchen.text.converters import to_unicode
+        >>> to_unicode(5)
+        u'5'
+        >>> to_unicode(5, nonstring='passthru')
+        5
+
+.. [#y6] :func:`yum.i18n.to_str` could return either a byte :class:`str` or
+    a :class:`unicode` string. In kitchen you can get the same effect but you
+    get to choose whether you want a byte :class:`str` or a :class:`unicode`
+    string. Use :func:`~kitchen.text.converters.to_bytes` for :class:`str`
+    and :func:`~kitchen.text.converters.to_unicode` for :class:`unicode`.
+
+.. [#y7] :func:`yum.misc.to_xml` was buggy as written. I think the intention
+    was for you to be able to pass a byte :class:`str` or :class:`unicode`
+    string in and get out a byte :class:`str` that was valid to use in an xml
+    file. The two kitchen functions
+    :func:`~kitchen.text.converters.byte_string_to_xml` and
+    :func:`~kitchen.text.converters.unicode_to_xml` do that for each string
+    type.
+
+.. [#y8] When porting :func:`yum.i18n.exception2msg` to use kitchen, you
+    should setup two wrapper functions to aid in your port. They'll look like
+    this:
+
+    .. code-block:: python
+
+        from kitchen.text.converters import EXCEPTION_CONVERTERS, \
+                BYTE_EXCEPTION_CONVERTERS, exception_to_unicode, \
+                exception_to_bytes
+
+        def exception2umsg(e):
+            '''Return a unicode representation of an exception'''
+            c = [lambda e: e.value]
+            c.extend(EXCEPTION_CONVERTERS)
+            return exception_to_unicode(e, converters=c)
+
+        def exception2bmsg(e):
+            '''Return a utf8 encoded str representation of an exception'''
+            c = [lambda e: e.value]
+            c.extend(BYTE_EXCEPTION_CONVERTERS)
+            return exception_to_bytes(e, converters=c)
+
+    The reason to define this wrapper is that many of the exceptions in yum
+    put the message in the :attr:`value` attribute of the :exc:`Exception`
+    instead of adding it to the :attr:`args` attribute. So the default
+    :data:`~kitchen.text.converters.EXCEPTION_CONVERTERS` don't know where to
+    find the message. The wrapper tells kitchen to check the :attr:`value`
+    attribute for the message. The reason to define two wrappers may be less
+    obvious. :func:`yum.i18n.exception2msg` can return a :class:`unicode`
+    string or a byte :class:`str` depending on a combination of what
+    attributes are present on the :exc:`Exception` and what locale the
+    function is being run in. By contrast,
+    :func:`kitchen.text.converters.exception_to_unicode` only returns
+    :class:`unicode` strings and
+    :func:`kitchen.text.converters.exception_to_bytes` only returns byte
+    :class:`str`. This is much safer as it keeps code that can only handle
+    :class:`unicode` or only handle byte :class:`str` correctly from getting
+    the wrong type when an input changes but it means you need to examine the
+    calling code when porting from :func:`yum.i18n.exception2msg` and use the
+    appropriate wrapper.
+
+.. _yum-i18n-init:
+
+Initializing Yum i18n
+=====================
+
+Previously, yum had several pieces of code to initialize i18n. From the
+toplevel of :file:`yum/i18n.py`::
+
+    try:
+        '''
+        Setup the yum translation domain and make _() and P_() translation wrappers
+        available.
+        using ugettext to make sure translated strings are in Unicode.
+        '''
+        import gettext
+        t = gettext.translation('yum', fallback=True)
+        _ = t.ugettext
+        P_ = t.ungettext
+    except:
+        '''
+        Something went wrong so we make a dummy _() wrapper there is just
+        returning the same text
+        '''
+        _ = dummy_wrapper
+        P_ = dummyP_wrapper
+
+With kitchen, this can be changed to this::
+
+    from kitchen.i18n import easy_gettext_setup, DummyTranslations
+    try:
+        _, P_ = easy_gettext_setup('yum')
+    except:
+        translations = DummyTranslations()
+        _ = translations.ugettext
+        P_ = translations.ungettext
+
+.. note:: In :ref:`overcoming-frustration`, it is mentioned that for some
+    things (like exception messages), using the byte :class:`str` oriented
+    functions is more appropriate. If this is desired, the setup portion
+    only requires a second call to :func:`kitchen.i18n.easy_gettext_setup`::
+
+        b_, bP_ = easy_gettext_setup('yum', use_unicode=False)
+
+The second place where i18n is setup is in :meth:`yum.YumBase._getConfig` in
+:file:`yum/__init__.py` if ``gaftonmode`` is in effect::
+
+    if startupconf.gaftonmode:
+        global _
+        _ = yum.i18n.dummy_wrapper
+
+This can be changed to::
+
+    if startupconf.gaftonmode:
+        global _
+        _ = DummyTranslations().ugettext
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
new file mode 100644
index 0000000..17c9ac7
--- /dev/null
+++ b/docs/tutorial.rst
@@ -0,0 +1,19 @@
+================================
+Using kitchen to write good code
+================================
+
+Kitchen's functions won't automatically make you a better programmer. You
+have to learn when and how to use them as well. This section of the
+documentation is intended to show you some of the ways that you can apply
+kitchen's functions to problems that may have arisen in your life. The goal
+of this section is to give you enough information to understand what the
+kitchen API can do for you and where in the :ref:`KitchenAPI` docs to look
+for something that can help you with your next issue. Along the way,
+you might pick up the knack for identifying issues with your code before you
+publish it. And that *will* make you a better coder.
+
+.. toctree::
+    :maxdepth: 2
+
+    unicode-frustrations
+    designing-unicode-apis
diff --git a/docs/unicode-frustrations.rst b/docs/unicode-frustrations.rst
new file mode 100644
index 0000000..c46e797
--- /dev/null
+++ b/docs/unicode-frustrations.rst
@@ -0,0 +1,571 @@
+.. _overcoming-frustration:
+
+==========================================================
+Overcoming frustration: Correctly using unicode in python2
+==========================================================
+
+In python-2.x, there are two types that deal with text.
+
+1. :class:`str` is for strings of bytes. These are very similar in nature to
+   how strings are handled in C.
+2. :class:`unicode` is for strings of unicode :term:`code points`.
+
+.. note::
+
+    **Just what the dickens is "Unicode"?**
+
+    One mistake that people encountering this issue for the first time make is
+    confusing the :class:`unicode` type and the encodings of unicode stored in
+    the :class:`str` type. In python, the :class:`unicode` type stores an
+    abstract sequence of :term:`code points`. Each :term:`code point`
+    represents a :term:`grapheme`. By contrast, byte :class:`str` stores
+    a sequence of bytes which can then be mapped to a sequence of :term:`code
+    points`. Each unicode encoding (:term:`UTF-8`, UTF-7, UTF-16, UTF-32,
+    etc) maps different sequences of bytes to the unicode :term:`code points`.
+
+    What does that mean to you as a programmer?
When you're dealing with text
+    manipulations (finding the number of characters in a string or cutting
+    a string on word boundaries) you should be dealing with :class:`unicode`
+    strings as they abstract characters in a manner that's appropriate for
+    thinking of them as a sequence of letters that you will see on a page.
+    When dealing with I/O, reading to and from the disk, printing to
+    a terminal, sending something over a network link, etc, you should be
+    dealing with byte :class:`str` as those devices are going to need to deal
+    with concrete implementations of what bytes represent your abstract
+    characters.
+
+In the python2 world many APIs use these two classes interchangeably but there
+are several important APIs where only one or the other will do the right
+thing. When you give the wrong type of string to an API that wants the other
+type, you may end up with an exception being raised (:exc:`UnicodeDecodeError`
+or :exc:`UnicodeEncodeError`). However, these exceptions aren't always raised
+because python implicitly converts between types... *sometimes*.
+
+-----------------------------------
+Frustration #1: Inconsistent Errors
+-----------------------------------
+
+Although converting when possible seems like the right thing to do, it's
+actually the first source of frustration. A programmer can test out their
+program with a string like: ``The quick brown fox jumped over the lazy dog``
+and not encounter any issues. But when they release their software into the
+wild, someone enters the string: ``I sat down for coffee at the café`` and
+suddenly an exception is thrown. The reason? The mechanism that converts
+between the two types is only able to deal with :term:`ASCII` characters.
+Once you throw non-:term:`ASCII` characters into your strings, you have to
+start dealing with the conversion manually.
+
+So, if I manually convert everything to either byte :class:`str` or
+:class:`unicode` strings, will I be okay? The answer is.... *sometimes*.
+
+---------------------------------
+Frustration #2: Inconsistent APIs
+---------------------------------
+
+The problem you run into when converting everything to byte :class:`str` or
+:class:`unicode` strings is that you'll be using someone else's API quite
+often (this includes the APIs in the |stdlib|_) and find that the API will only
+accept byte :class:`str` or only accept :class:`unicode` strings. Or worse,
+that the code will accept either when you're dealing with strings that consist
+solely of :term:`ASCII` but throw an error when you give it a string that's
+got non-:term:`ASCII` characters. When you encounter these APIs you first
+need to identify which type will work better and then you have to convert your
+values to the correct type for that code. Thus the programmer who wants to
+proactively fix all unicode errors in their code needs to do two things:
+
+1. You must keep track of what type your sequences of text are. Does
+   ``my_sentence`` contain :class:`unicode` or :class:`str`? If you don't
+   know that then you're going to be in for a world of hurt.
+2. Anytime you call a function you need to evaluate whether that function will
+   do the right thing with :class:`str` or :class:`unicode` values. Sending
+   the wrong value here will lead to a :exc:`UnicodeError` being thrown when
+   the string contains non-:term:`ASCII` characters.
+
+.. note::
+
+    There is one mitigating factor here. The python community has been
+    standardizing on using :class:`unicode` in all its APIs.
Although there
+    are some APIs that you need to send byte :class:`str` to in order to be
+    safe, (including things as ubiquitous as :func:`print` as we'll see in the
+    next section), it's getting easier and easier to use :class:`unicode`
+    strings with most APIs.
+
+------------------------------------------------
+Frustration #3: Inconsistent treatment of output
+------------------------------------------------
+
+Alright, since the python community is moving to using :class:`unicode`
+strings everywhere, we might as well convert everything to :class:`unicode`
+strings and use that by default, right? Sounds good most of the time but
+there's at least one huge caveat to be aware of. Anytime you output text to
+the terminal or to a file, the text has to be converted into a byte
+:class:`str`. Python will try to implicitly convert from :class:`unicode` to
+byte :class:`str`... but it will throw an exception if the bytes are
+non-:term:`ASCII`::
+
+    >>> string = unicode(raw_input(), 'utf8')
+    café
+    >>> log = open('/var/tmp/debug.log', 'w')
+    >>> log.write(string)
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 3: ordinal not in range(128)
+
+Okay, this is simple enough to solve: Just convert to a byte :class:`str` and
+we're all set::
+
+    >>> string = unicode(raw_input(), 'utf8')
+    café
+    >>> string_for_output = string.encode('utf8', 'replace')
+    >>> log = open('/var/tmp/debug.log', 'w')
+    >>> log.write(string_for_output)
+    >>>
+
+So that was simple, right? Well... there's one gotcha that makes things a bit
+harder to debug sometimes. When you attempt to write non-:term:`ASCII`
+:class:`unicode` strings to a file-like object you get a traceback every
+time. But what happens when you use :func:`print`? The terminal is
+a file-like object so it should raise an exception right? The answer to that
+is.... *sometimes*:
+
+.. code-block:: pycon
+
+    $ python
+    >>> print u'café'
+    café
+
+No exception. Okay, we're fine then?
+
+We are until someone does one of the following:
+
+* Runs the script in a different locale:
+
+  .. code-block:: pycon
+
+      $ LC_ALL=C python
+      >>> # Note: if you're using a good terminal program when running in the C locale
+      >>> # The terminal program will prevent you from entering non-ASCII characters
+      >>> # python will still recognize them if you use the codepoint instead:
+      >>> print u'caf\xe9'
+      Traceback (most recent call last):
+        File "<stdin>", line 1, in <module>
+      UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 3: ordinal not in range(128)
+
+* Redirects output to a file:
+
+  .. code-block:: pycon
+
+      $ cat test.py
+      #!/usr/bin/python -tt
+      # -*- coding: utf-8 -*-
+      print u'café'
+      $ ./test.py >t
+      Traceback (most recent call last):
+        File "./test.py", line 4, in <module>
+          print u'café'
+      UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 3: ordinal not in range(128)
+
+Okay, the locale thing is a pain but understandable: the C locale doesn't
+understand any characters outside of :term:`ASCII` so naturally attempting to
+display those won't work. Now why does redirecting to a file cause problems?
+It's because :func:`print` in python2 is treated specially. Whereas the other
+file-like objects in python always convert to :term:`ASCII` unless you set
+them up differently, using :func:`print` to output to the terminal will use
+the user's locale to convert before sending the output to the terminal.
When
+:func:`print` is not outputting to the terminal (being redirected to a file,
+for instance), :func:`print` decides that it doesn't know what locale to use
+for that file and so it tries to convert to :term:`ASCII` instead.
+
+So what does this mean for you, as a programmer? Unless you have the luxury
+of controlling how your users use your code, you should always, always, always
+convert to a byte :class:`str` before outputting strings to the terminal or to
+a file. Python even provides you with a facility to do just this. If you
+know that every :class:`unicode` string you send to a particular file-like
+object (for instance, :data:`~sys.stdout`) should be converted to a particular
+encoding you can use a :class:`codecs.StreamWriter` object to convert from
+a :class:`unicode` string into a byte :class:`str`. In particular,
+:func:`codecs.getwriter` will return a :class:`~codecs.StreamWriter` class
+that will help you to wrap a file-like object for output. Using our
+:func:`print` example:
+
+.. code-block:: python
+
+    $ cat test.py
+    #!/usr/bin/python -tt
+    # -*- coding: utf-8 -*-
+    import codecs
+    import sys
+
+    UTF8Writer = codecs.getwriter('utf8')
+    sys.stdout = UTF8Writer(sys.stdout)
+    print u'café'
+    $ ./test.py >t
+    $ cat t
+    café
+
+-----------------------------------------
+Frustrations #4 and #5 -- The other shoes
+-----------------------------------------
+
+In English, there's a saying "waiting for the other shoe to drop". It means
+that when one event (usually bad) happens, you come to expect another event
+(usually worse) to come after. In this case we have two other shoes.
+
+
+Frustration #4: Now it doesn't take byte strings?!
+==================================================
+
+If you wrap :data:`sys.stdout` using :func:`codecs.getwriter` and think you
+are now safe to print any variable without checking its type I am afraid
+I must inform you that you're not paying enough attention to :term:`Murphy's
+Law`. The :class:`~codecs.StreamWriter` that :func:`codecs.getwriter`
+provides will take :class:`unicode` strings and transform them into byte
+:class:`str` before they get to :data:`sys.stdout`. The problem is if you
+give it something that's already a byte :class:`str` it tries to transform
+that as well. To do that it tries to turn the byte :class:`str` you give it
+into :class:`unicode` and then transform that back into a byte :class:`str`...
+and since it uses the :term:`ASCII` codec to perform those conversions,
+chances are that it'll blow up when making them::
+
+    >>> import codecs
+    >>> import sys
+    >>> UTF8Writer = codecs.getwriter('utf8')
+    >>> sys.stdout = UTF8Writer(sys.stdout)
+    >>> print 'café'
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+      File "/usr/lib64/python2.6/codecs.py", line 351, in write
+        data, consumed = self.encode(object, self.errors)
+    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 3: ordinal not in range(128)
+
+To work around this, kitchen provides an alternate version of
+:func:`codecs.getwriter` that can deal with both byte :class:`str` and
+:class:`unicode` strings. Use :func:`kitchen.text.converters.getwriter` in
+place of the :mod:`codecs` version like this::
+
+    >>> import sys
+    >>> from kitchen.text.converters import getwriter
+    >>> UTF8Writer = getwriter('utf8')
+    >>> sys.stdout = UTF8Writer(sys.stdout)
+    >>> print u'café'
+    café
+    >>> print 'café'
+    café
+
+Frustration #5: Exceptions
+==========================
+
+Okay, so we've gotten ourselves this far.
We convert everything to
+:class:`unicode` strings. We're aware that we need to convert back into byte
+:class:`str` before we write to the terminal. We've worked around the
+inability of the standard :func:`~codecs.getwriter` to deal with both byte
+:class:`str` and :class:`unicode` strings. Are we all set? Well, there's at
+least one more gotcha: raising exceptions with a :class:`unicode` message.
+Take a look:
+
+.. code-block:: pycon
+
+    >>> class MyException(Exception):
+    ...     pass
+    ...
+    >>> raise MyException(u'Cannot do this')
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    __main__.MyException: Cannot do this
+    >>> raise MyException(u'Cannot do this while at a café')
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    __main__.MyException:
+    >>>
+
+No, I didn't truncate that last line; python really cannot handle
+non-:term:`ASCII` characters in a :class:`unicode` exception message and will
+output the exception without the message if the message contains them. What
+happens if we try to use the handy dandy
+:func:`~kitchen.text.converters.getwriter` trick to work around this?
+
+.. code-block:: pycon
+
+    >>> import sys
+    >>> from kitchen.text.converters import getwriter
+    >>> sys.stderr = getwriter('utf8')(sys.stderr)
+    >>> raise MyException(u'Cannot do this')
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    __main__.MyException: Cannot do this
+    >>> raise MyException(u'Cannot do this while at a café')
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    __main__.MyException>>>
+
+Not only did this also fail, it even swallowed the trailing newline that's
+normally there... So how do we make this work? Transform from
+:class:`unicode` strings to byte :class:`str` manually before outputting::
+
+    >>> from kitchen.text.converters import to_bytes
+    >>> raise MyException(to_bytes(u'Cannot do this while at a café'))
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+    __main__.MyException: Cannot do this while at a café
+    >>>
+
+.. warning::
+
+    If you use :func:`codecs.getwriter` on :data:`sys.stderr`, you'll find
+    that raising an exception with a byte :class:`str` is broken by the
+    default :class:`~codecs.StreamWriter` as well. Don't do that or you'll
+    have no way to output non-:term:`ASCII` characters. If you want to use
+    a :class:`~codecs.StreamWriter` to encode other things on stderr while
+    still having working exceptions, use
+    :func:`kitchen.text.converters.getwriter`.
+
+-------------------------------------------
+Frustration #6: Inconsistent APIs Part deux
+-------------------------------------------
+
+Sometimes you do everything right in your code but other people's code fails
+you. With unicode issues this happens more often than we'd like. A glaring
+example is when you get values back from a function that aren't consistently
+:class:`unicode` strings or byte :class:`str`.
+
+An example from the |stdlib|_ is :mod:`gettext`. The :mod:`gettext` functions
+are used to help translate messages that you display to users in the users'
+native languages. Since most languages contain letters outside of the
+:term:`ASCII` range, the values that are returned contain unicode characters.
+:mod:`gettext` provides you with :meth:`~gettext.GNUTranslations.ugettext` and
+:meth:`~gettext.GNUTranslations.ungettext` to return these translations as
+:class:`unicode` strings and :meth:`~gettext.GNUTranslations.gettext`,
+:meth:`~gettext.GNUTranslations.ngettext`,
+:meth:`~gettext.GNUTranslations.lgettext`, and
+:meth:`~gettext.GNUTranslations.lngettext` to return them as encoded byte
+:class:`str`. Unfortunately, even though they're documented to return only
+one type of string or the other, the implementation has corner cases where the
+wrong type can be returned.
+
+This means that even if you separate your :class:`unicode` strings and byte
+:class:`str` correctly before you pass your strings to a :mod:`gettext`
+function, afterwards, you might have to check that you have the right sort of
+string type again.
+
+.. note::
+
+    :mod:`kitchen.i18n` provides alternate gettext translation objects that
+    return only byte :class:`str` or only :class:`unicode` strings.
+
+---------------
+A few solutions
+---------------
+
+Now that we've identified the issues, can we define a comprehensive strategy
+for dealing with them?
+
+Convert text at the border
+==========================
+
+If you get some piece of text from a library, read from a file, etc., turn it
+into a :class:`unicode` string immediately. Since python is moving in the
+direction of :class:`unicode` strings everywhere, it's going to be easier to
+work with :class:`unicode` strings within your code.
+
+If your code is heavily involved with using things that are bytes, you can do
+the opposite and convert all text into byte :class:`str` at the border and
+only convert to :class:`unicode` when you need it for passing to another
+library or performing string operations on it.
+
+In either case, the important thing is to pick a default type for strings and
+stick with it throughout your code. When you mix the types, it becomes much
+easier to mistakenly operate on a string with a function that can only use
+the other type.
+
+.. note:: In python3, the abstract unicode type becomes much more prominent.
+    The type named ``str`` is the equivalent of python2's :class:`unicode`,
+    and python3's ``bytes`` type replaces python2's :class:`str`. Most APIs
+    deal in the unicode type of string, with just some low level pieces
+    dealing with bytes. The implicit conversions between bytes and unicode
+    are removed and whenever you want to make the conversion you need to do
+    so explicitly.
+
+When the data needs to be treated as bytes (or unicode) use a naming convention
+===============================================================================
+
+Sometimes you're converting nearly all of your data to :class:`unicode`
+strings but you have one or two values where you have to keep byte
+:class:`str` around. This is often the case when you need to use the value
+verbatim with some external resource, for instance, filenames or key values
+in a database. When you do this, use a naming convention for the data you're
+working with so you (and others reading your code later) don't get confused
+about what's being stored in the value.
+
+If you need both a textual string to present to the user and a byte value for
+an exact match, consider keeping both versions around. You can either use two
+variables for this or a :class:`dict` whose key is the byte value.
+
+.. note:: You can use the naming convention used in kitchen as a guide for
+    implementing your own naming convention. It prefixes byte :class:`str`
+    variables of unknown encoding with ``b_`` and byte :class:`str` of known
+    encoding with the encoding name, like ``utf8_``. If the default were to
+    handle :class:`str` and only keep a few :class:`unicode` values, those
+    variables would be prefixed with ``u_``.
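+
+For instance, a module following kitchen's convention might contain variables
+like these (the names and values are purely illustrative)::
+
+    from kitchen.text.converters import to_unicode
+
+    b_filename = '/var/tmp/caf\xc3\xa9.log'  # byte str, encoding unknown
+    utf8_motd = 'Bienvenue au caf\xc3\xa9'   # byte str known to be utf-8
+    motd = to_unicode(utf8_motd, 'utf-8')    # unicode, the default type here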
+
+When outputting data, convert back into bytes
+=============================================
+
+When you go to send your data back outside of your program (to the
+filesystem, over the network, displaying to the user, etc.), turn the data
+back into a byte :class:`str`. How you do this will depend on the expected
+output format of the data. For displaying to the user, you can use the
+user's default encoding, obtained from :func:`locale.getpreferredencoding`.
+For writing to a file, your best bet is to pick a single encoding and stick
+with it.
+
+.. warning::
+
+    When using the encoding that the user has set (for instance, using
+    :func:`locale.getpreferredencoding`), remember that they may have their
+    encoding set to something that can't display every single unicode
+    character. That means when you convert from :class:`unicode` to a byte
+    :class:`str` you need to decide what should happen if a character is not
+    representable in the user's encoding. For purposes of displaying
+    messages to the user, it's usually okay to use the ``replace`` encoding
+    error handler to replace the unrepresentable characters with a question
+    mark or other symbol meaning the character couldn't be displayed.
+
+You can use :func:`kitchen.text.converters.getwriter` to do this automatically
+for :data:`sys.stdout`. When creating exception messages, be sure to convert
+to bytes manually.
+
+When writing unittests, include non-ASCII values and both unicode and str types
+================================================================================
+
+Unless you know that a specific portion of your code will only deal with
+:term:`ASCII`, be sure to include non-:term:`ASCII` values in your unittests.
+Including a few characters from several different scripts is highly advised as
+well, because some code may have special cased accented roman characters but
+not know how to handle characters used in Asian alphabets.
+
+Similarly, unless you know that that portion of your code will only be given
+:class:`unicode` strings or only byte :class:`str`, be sure to try variables
+of both types in your unittests. When doing this, make sure that the
+variables are also non-:term:`ASCII`, as python's implicit conversion will
+mask problems with pure :term:`ASCII` data. In many cases, it makes sense to
+check what happens if byte :class:`str` and :class:`unicode` strings that
+won't decode in the present locale are given.
+
+Be vigilant about spotting poor APIs
+====================================
+
+Make sure that the libraries you use return only :class:`unicode` strings or
+byte :class:`str`. Unittests can help you spot issues here by running many
+variations of data through your functions and checking that you're still
+getting the types of string that you expect.
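+
+As a minimal sketch of the kind of unittest recommended above
+(``normalize_name`` and its module are hypothetical, and we assume it is
+documented to always return :class:`unicode`)::
+
+    # -*- coding: utf-8 -*-
+    import unittest
+
+    from mymodule import normalize_name  # hypothetical function under test
+
+    class TestNormalizeName(unittest.TestCase):
+        def test_unicode_input(self):
+            # Non-ASCII unicode input should yield a unicode result
+            self.assertTrue(isinstance(normalize_name(u'caf\xe9'), unicode))
+
+        def test_byte_str_input(self):
+            # The same non-ASCII text as a utf-8 byte str should too
+            self.assertTrue(isinstance(normalize_name('caf\xc3\xa9'), unicode))
+
+    if __name__ == '__main__':
+        unittest.main()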
+
+Example: Putting this all together with kitchen
+===============================================
+
+The kitchen library provides a wide array of functions to help you deal with
+byte :class:`str` and :class:`unicode` strings in your program. Here's
+a short example that uses many kitchen functions to do its work::
+
+    #!/usr/bin/python -tt
+    # -*- coding: utf-8 -*-
+    import locale
+    import os
+    import sys
+    import unicodedata
+
+    from kitchen.text.converters import getwriter, to_bytes, to_unicode
+    from kitchen.i18n import get_translation_object
+
+    if __name__ == '__main__':
+        # Setup gettext driven translations but use the kitchen functions so
+        # we don't have the mismatched bytes-unicode issues.
+        translations = get_translation_object('example')
+        # We use _() for marking strings that we operate on as unicode.
+        # This is pretty much everything.
+        _ = translations.ugettext
+        # And b_() for marking strings that we operate on as bytes.
+        # This is limited to exceptions.
+        b_ = translations.lgettext
+
+        # Setup stdout
+        encoding = locale.getpreferredencoding()
+        Writer = getwriter(encoding)
+        sys.stdout = Writer(sys.stdout)
+
+        # Load data. Format is filename\0description
+        # description should be utf-8 but filename can be any legal filename
+        # on the filesystem.
+        # Sample datafile.txt:
+        #   /etc/shells\x00Shells available on caf\xc3\xa9.lan
+        #   /var/tmp/file\xff\x00File with non-utf8 data in the filename
+        #
+        # And to create /var/tmp/file\xff (under bash or zsh) do:
+        #   echo 'Some data' > /var/tmp/file$'\377'
+        datafile = open('datafile.txt', 'r')
+        data = {}
+        for line in datafile:
+            # We're going to keep filename as bytes because we will need the
+            # exact bytes to access files on a POSIX operating system.
+            # description, we'll immediately transform into unicode type.
+            b_filename, description = line.split('\0', 1)
+
+            # to_unicode defaults to decoding output from utf-8 and replacing
+            # any problematic bytes with the unicode replacement character.
+            # We accept mangling of the description here knowing that our file
+            # format is supposed to use utf-8 in that field and that the
+            # description will only be displayed to the user, not used as
+            # a key value.
+            description = to_unicode(description, 'utf-8').strip()
+            data[b_filename] = description
+        datafile.close()
+
+        # We're going to add a pair of extra fields onto our data to show the
+        # length of the description and the filesize. We put those between
+        # the filename and description because we haven't checked that the
+        # description is free of NULLs.
+        datafile = open('newdatafile.txt', 'w')
+
+        # Name filename with a b_ prefix to denote byte string of unknown encoding
+        for b_filename in data:
+            # Since we have the byte representation of filename, we can read any
+            # filename
+            if os.access(b_filename, os.F_OK):
+                size = os.path.getsize(b_filename)
+            else:
+                size = 0
+            # Because the description is unicode type, we know the number of
+            # characters corresponds to the length of the normalized unicode
+            # string. (Look the description up in data here; the loop
+            # variable from the loading loop above would be stale.)
+            length = len(unicodedata.normalize('NFC', data[b_filename]))
+
+            # Print a summary to the screen.
+            # Note that we do not let implicit type conversion from str to
+            # unicode transform b_filename into a unicode string. That might
+            # fail as python would use the ASCII codec. Instead we use
+            # to_unicode() to explicitly transform in a way that we know will
+            # not traceback.
+ print _(u'filename: %s') % to_unicode(b_filename) + print _(u'file size: %s') % size + print _(u'desc length: %s') % length + print _(u'description: %s') % data[b_filename] + + # First combine the unicode portion + line = u'%s\0%s\0%s' % (size, length, data[b_filename]) + # Since the filenames are bytes, turn everything else to bytes before combining + # Turning into unicode first would be wrong as the bytes in b_filename + # might not convert + b_line = '%s\0%s\n' % (b_filename, to_bytes(line)) + + # Just to demonstrate that getwriter will pass bytes through fine + print b_('Wrote: %s') % b_line + datafile.write(b_line) + datafile.close() + + # And just to show how to properly deal with an exception. + # Note two things about this: + # 1) We use the b_() function to translate the string. This returns a + # byte string instead of a unicode string + # 2) We're using the b_() function returned by kitchen. If we had + # used the one from gettext we would need to convert the message to + # a byte str first + message = u'Demonstrate the proper way to raise exceptions. Sincerely, \u3068\u3057\u304a' + raise Exception(b_(message)) + +.. seealso:: :mod:`kitchen.text.converters` diff --git a/kitchen/__init__.py b/kitchen/__init__.py new file mode 100644 index 0000000..29c9d6d --- /dev/null +++ b/kitchen/__init__.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2011 Red Hat, Inc +# +# kitchen is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# kitchen is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with kitchen; if not, see +# +# Authors: +# Toshio Kuratomi +# +''' +Kitchen + +Aggregate of a bunch of unrelated but helpful python modules. +''' + +# Pylint disabled messages: +# :C0103: We need gettext aliases for both unicode strings and byte strings. +# The byte string one (b_) triggers this warning. 
+from kitchen import i18n +from kitchen import versioning + +(_, N_) = i18n.easy_gettext_setup('kitchen.core') +#pylint: disable-msg=C0103 +(b_, bN_) = i18n.easy_gettext_setup('kitchen.core', use_unicode=False) +#pylint: enable-msg=C0103 + +__version_info__ = ((1, 1, 1),) +__version__ = versioning.version_tuple_to_string(__version_info__) + +__all__ = ('exceptions', 'release',) diff --git a/kitchen/collections/__init__.py b/kitchen/collections/__init__.py new file mode 100644 index 0000000..35757fa --- /dev/null +++ b/kitchen/collections/__init__.py @@ -0,0 +1,9 @@ +from kitchen.versioning import version_tuple_to_string + +__version_info__ = ((1, 1, 0),) +__version__ = version_tuple_to_string(__version_info__) + +from kitchen.collections import strictdict +from kitchen.collections.strictdict import StrictDict + +__all__ = ('strictdict', 'StrictDict',) diff --git a/kitchen/collections/strictdict.py b/kitchen/collections/strictdict.py new file mode 100644 index 0000000..742dd7d --- /dev/null +++ b/kitchen/collections/strictdict.py @@ -0,0 +1,87 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2010 Red Hat, Inc +# +# kitchen is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# kitchen is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with kitchen; if not, see +# +# Authors: +# Toshio Kuratomi +''' +---------- +StrictDict +---------- + +:class:`kitchen.collections.StrictDict` provides a dictionary that treats +:class:`str` and :class:`unicode` as distinct key values. +''' + +# Pylint disabled messages: +# :C0111: We're implementing the dict interface so just reference the dict +# documentation rather than having our own docstrings + +try: + # :E0611: Pylint false positive. We try to import from the stdlib but we + # have a fallback so this is okay. + #pylint:disable-msg=E0611 + from collections import defaultdict +except ImportError: + from kitchen.pycompat25.collections import defaultdict + +class StrictDict(defaultdict): + ''' + Map class that considers :class:`unicode` and :class:`str` different keys + + Ordinarily when you are dealing with a :class:`dict` keyed on strings you + want to have keys that have the same characters end up in the same bucket + even if one key is :class:`unicode` and the other is a byte :class:`str`. + The normal :class:`dict` type does this for :term:`ASCII` characters (but + not for anything outside of the :term:`ASCII` range.) + + Sometimes, however, you want to keep the two string classes strictly + separate, for instance, if you're creating a single table that can map + from :class:`unicode` characters to :class:`str` characters and vice + versa. This class will help you do that by making all :class:`unicode` + keys evaluate to a different key than all :class:`str` keys. + + .. seealso:: + :class:`dict` + for documentation on this class's methods. This class implements + all the standard :class:`dict` methods. Its treatment of + :class:`unicode` and :class:`str` keys as separate is the only + difference. 
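+
+    A quick, illustrative example of the difference from a plain
+    :class:`dict` (the keys and values are arbitrary)::
+
+        >>> d = StrictDict()
+        >>> d[u'a'] = 1
+        >>> d['a'] = 2
+        >>> len(d)
+        2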
+ + ''' + #pylint:disable-msg=C0111 + def __getitem__(self, key): + return defaultdict.__getitem__(self, (repr(key), key)) + + def __setitem__(self, key, value): + defaultdict.__setitem__(self, (repr(key), key), value) + + def __delitem__(self, key): + defaultdict.__delitem__(self, (repr(key), key)) + + def __iter__(self): + for i in defaultdict.__iter__(self): + yield i[1] + + iterkeys = __iter__ + + def keys(self): + return list(self.__iter__()) + + def __contains__(self, key): + return defaultdict.__contains__(self, (repr(key), key)) + +__all__ = ('StrictDict',) diff --git a/kitchen/exceptions.py b/kitchen/exceptions.py new file mode 100644 index 0000000..d46bf51 --- /dev/null +++ b/kitchen/exceptions.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2010 Red Hat, Inc +# +# kitchen is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 2.1 of the License, or (at your option) +# any later version. +# +# kitchen is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +# more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with kitchen; if not, see +# +# Authors: +# Toshio Kuratomi +# +''' +----------------------- +Base kitchen exceptions +----------------------- + +Exception classes for kitchen and the root of the exception hierarchy for +all kitchen modules. +''' + +class KitchenError(Exception): + '''Base exception class for any error thrown directly by kitchen. + ''' + pass + +__all__ = ('KitchenError',) diff --git a/kitchen/i18n/__init__.py b/kitchen/i18n/__init__.py new file mode 100644 index 0000000..29561a2 --- /dev/null +++ b/kitchen/i18n/__init__.py @@ -0,0 +1,827 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2010-2011 Red Hat, Inc +# Copyright (c) 2009 Milos Komarcevic +# Copyright (c) 2008 Tim Lauridsen +# +# kitchen is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 2.1 of the License, or (at your option) +# any later version. +# +# kitchen is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +# more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with kitchen; if not, see +# +# Authors: James Antill +# Milos Komarcevic +# Toshio Kuratomi +# Tim Lauridsen +# Luke Macken +# Seth Vidal +# +# Portions of code taken from yum/i18n.py +# Portions of code adapted from |stdlib|_ gettext.py +''' +:term:`I18N` is an important piece of any modern program. Unfortunately, +setting up :term:`i18n` in your program is often a confusing process. The +functions provided here aim to make the programming side of that a little +easier. 
+ +Most projects will be able to do something like this when they startup:: + + # myprogram/__init__.py: + + import os + import sys + + from kitchen.i18n import easy_gettext_setup + + _, N_ = easy_gettext_setup('myprogram', localedirs=( + os.path.join(os.path.realpath(os.path.dirname(__file__)), 'locale'), + os.path.join(sys.prefix, 'lib', 'locale') + )) + +Then, in other files that have strings that need translating:: + + # myprogram/commands.py: + + from myprogram import _, N_ + + def print_usage(): + print _(u"""available commands are: + --help Display help + --version Display version of this program + --bake-me-a-cake as fast as you can + """) + + def print_invitations(age): + print _('Please come to my party.') + print N_('I will be turning %(age)s year old', + 'I will be turning %(age)s years old', age) % {'age': age} + +See the documentation of :func:`easy_gettext_setup` and +:func:`get_translation_object` for more details. + + .. seealso:: + + :mod:`gettext` + for details of how the python gettext facilities work + `babel `_ + The babel module for in depth information on gettext, :term:`message + catalogs`, and translating your app. babel provides some nice + features for :term:`i18n` on top of :mod:`gettext` +''' +# Pylint disabled messages: +# :E1101: NewGNUTranslations is modeled as a replacement for GNUTranslations. +# That module invokes the _parse message to create some of its attributes. +# Pylint doesn't see those attributes being defined since it doesn't know +# when _parse() is called. We disable E1101 when accessing self._catalog +# and self.plural for this reason. +# :C0103: We're replicating the gettext API here so we need to use method and +# parameter names that mirror gettext. +# :C0111: We're replicating the gettext API here so for the gettext +# translation object methods we point people at the stdlib docs + +from kitchen.versioning import version_tuple_to_string + +__version_info__ = ((2, 1, 1),) +__version__ = version_tuple_to_string(__version_info__) + +import copy +from errno import ENOENT +import gettext +import itertools +import locale +import os +import sys + +# We use the _default_localedir definition in get_translation_object +try: + from gettext import _default_localedir as _DEFAULT_LOCALEDIR +except ImportError: + _DEFAULT_LOCALEDIR = os.path.join(sys.prefix, 'share', 'locale') + +from kitchen.text.converters import to_bytes, to_unicode +from kitchen.text.misc import byte_string_valid_encoding + +# We cache parts of the translation objects just like stdlib's gettext so that +# we don't reparse the message files and keep them in memory separately if the +# same catalog is opened twice. +_translations = {} + +class DummyTranslations(object, gettext.NullTranslations): + '''Safer version of :class:`gettext.NullTranslations` + + This Translations class doesn't translate the strings and is intended to + be used as a fallback when there were errors setting up a real + Translations object. It's safer than :class:`gettext.NullTranslations` in + its handling of byte :class:`str` vs :class:`unicode` strings. + + Unlike :class:`~gettext.NullTranslations`, this Translation class will + never throw a :exc:`~exceptions.UnicodeError`. The code that you have + around a call to :class:`DummyTranslations` might throw + a :exc:`~exceptions.UnicodeError` but at least that will be in code you + control and can fix. 
Also, unlike :class:`~gettext.NullTranslations` all + of this Translation object's methods guarantee to return byte :class:`str` + except for :meth:`ugettext` and :meth:`ungettext` which guarantee to + return :class:`unicode` strings. + + When byte :class:`str` are returned, the strings will be encoded according + to this algorithm: + + 1) If a fallback has been added, the fallback will be called first. + You'll need to consult the fallback to see whether it performs any + encoding changes. + 2) If a byte :class:`str` was given, the same byte :class:`str` will + be returned. + 3) If a :class:`unicode` string was given and :meth:`set_output_charset` + has been called then we encode the string using the + :attr:`output_charset` + 4) If a :class:`unicode` string was given and this is :meth:`gettext` or + :meth:`ngettext` and :attr:`_charset` was set output in that charset. + 5) If a :class:`unicode` string was given and this is :meth:`gettext` + or :meth:`ngettext` we encode it using 'utf-8'. + 6) If a :class:`unicode` string was given and this is :meth:`lgettext` + or :meth:`lngettext` we encode using the value of + :func:`locale.getpreferredencoding` + + For :meth:`ugettext` and :meth:`ungettext`, we go through the same set of + steps with the following differences: + + * We transform byte :class:`str` into :class:`unicode` strings for + these methods. + * The encoding used to decode the byte :class:`str` is taken from + :attr:`input_charset` if it's set, otherwise we decode using + :term:`UTF-8`. + + .. attribute:: input_charset + + is an extension to the |stdlib|_ :mod:`gettext` that specifies what + charset a message is encoded in when decoding a message to + :class:`unicode`. This is used for two purposes: + + 1) If the message string is a byte :class:`str`, this is used to decode + the string to a :class:`unicode` string before looking it up in the + :term:`message catalog`. + 2) In :meth:`~kitchen.i18n.DummyTranslations.ugettext` and + :meth:`~kitchen.i18n.DummyTranslations.ungettext` methods, if a byte + :class:`str` is given as the message and is untranslated this is used + as the encoding when decoding to :class:`unicode`. This is different + from :attr:`_charset` which may be set when a :term:`message catalog` + is loaded because :attr:`input_charset` is used to describe an encoding + used in a python source file while :attr:`_charset` describes the + encoding used in the :term:`message catalog` file. + + Any characters that aren't able to be transformed from a byte :class:`str` + to :class:`unicode` string or vice versa will be replaced with + a replacement character (ie: ``u'�'`` in unicode based encodings, ``'?'`` in other + :term:`ASCII` compatible encodings). + + .. seealso:: + + :class:`gettext.NullTranslations` + For information about what methods are available and what they do. + + .. versionchanged:: kitchen-1.1.0 ; API kitchen.i18n 2.1.0 + * Although we had adapted :meth:`gettext`, :meth:`ngettext`, + :meth:`lgettext`, and :meth:`lngettext` to always return byte + :class:`str`, we hadn't forced those byte :class:`str` to always be + in a specified charset. We now make sure that :meth:`gettext` and + :meth:`ngettext` return byte :class:`str` encoded using + :attr:`output_charset` if set, otherwise :attr:`charset` and if + neither of those, :term:`UTF-8`. With :meth:`lgettext` and + :meth:`lngettext` :attr:`output_charset` if set, otherwise + :func:`locale.getpreferredencoding`. 
+        * Make setting :attr:`input_charset` and :attr:`output_charset` also
+          set those attributes on any fallback translation objects.
+    '''
+    #pylint: disable-msg=C0103,C0111
+    def __init__(self, fp=None):
+        gettext.NullTranslations.__init__(self, fp)
+
+        # Python 2.3 compat
+        if not hasattr(self, '_output_charset'):
+            self._output_charset = None
+
+        # Extension for making ugettext and ungettext more sane
+        # 'utf-8' is only a default here. Users can override.
+        self._input_charset = 'utf-8'
+
+    def _set_input_charset(self, charset):
+        if self._fallback:
+            try:
+                self._fallback.input_charset = charset
+            except AttributeError:
+                pass
+        self._input_charset = charset
+
+    def _get_input_charset(self):
+        return self._input_charset
+
+    input_charset = property(_get_input_charset, _set_input_charset)
+
+    def set_output_charset(self, charset):
+        '''Set the output charset
+
+        This serves two purposes. The normal
+        :meth:`gettext.NullTranslations.set_output_charset` does not set the
+        output on fallback objects. On python-2.3,
+        :class:`gettext.NullTranslations` objects don't contain this method.
+        '''
+        if self._fallback:
+            try:
+                self._fallback.set_output_charset(charset)
+            except AttributeError:
+                pass
+        try:
+            gettext.NullTranslations.set_output_charset(self, charset)
+        except AttributeError:
+            self._output_charset = charset
+
+    if not hasattr(gettext.NullTranslations, 'output_charset'):
+        def output_charset(self):
+            '''Compatibility for python2.3 which doesn't have output_charset'''
+            return self._output_charset
+
+    def _reencode_if_necessary(self, message, output_encoding):
+        '''Return a byte string that's valid in a specific charset.
+
+        .. warning:: This method may mangle the message if the input encoding
+            is not known or the message isn't representable in the chosen
+            output encoding.
+        '''
+        valid = False
+        msg = None
+        try:
+            valid = byte_string_valid_encoding(message, output_encoding)
+        except TypeError:
+            # input was unicode, so it needs to be encoded
+            pass
+
+        if valid:
+            return message
+        try:
+            # Decode to unicode so we can re-encode to desired encoding
+            msg = to_unicode(message, encoding=self.input_charset,
+                    nonstring='strict')
+        except TypeError:
+            # Not a string; return an empty byte string
+            return ''
+
+        # Make sure that we're returning a str of the desired encoding
+        return to_bytes(msg, encoding=output_encoding)
+
+    def gettext(self, message):
+        # First use any fallback gettext objects. Since DummyTranslations
+        # doesn't do any translation on its own, this is a good first step.
+ if self._fallback: + try: + message = self._fallback.gettext(message) + except (AttributeError, UnicodeError): + # Ignore UnicodeErrors: We'll do our own encoding next + pass + + # Next decide what encoding to use for the strings we return + output_encoding = (self._output_charset or self._charset or + self.input_charset) + + return self._reencode_if_necessary(message, output_encoding) + + def ngettext(self, msgid1, msgid2, n): + # Default + if n == 1: + message = msgid1 + else: + message = msgid2 + + # The fallback method might return something different + if self._fallback: + try: + message = self._fallback.ngettext(msgid1, msgid2, n) + except (AttributeError, UnicodeError): + # Ignore UnicodeErrors: We'll do our own encoding next + pass + + # Next decide what encoding to use for the strings we return + output_encoding = (self._output_charset or self._charset or + self.input_charset) + + return self._reencode_if_necessary(message, output_encoding) + + def lgettext(self, message): + if self._fallback: + try: + message = self._fallback.lgettext(message) + except (AttributeError, UnicodeError): + # Ignore UnicodeErrors: we'll do our own encoding next + # AttributeErrors happen on py2.3 where lgettext is not + # implemented + pass + + # Next decide what encoding to use for the strings we return + output_encoding = (self._output_charset or + locale.getpreferredencoding()) + + return self._reencode_if_necessary(message, output_encoding) + + def lngettext(self, msgid1, msgid2, n): + # Default + if n == 1: + message = msgid1 + else: + message = msgid2 + # Fallback method might have something different + if self._fallback: + try: + message = self._fallback.lngettext(msgid1, msgid2, n) + except (AttributeError, UnicodeError): + # Ignore UnicodeErrors: we'll do our own encoding next + # AttributeError happens on py2.3 where lngettext is not + # implemented + pass + + # Next decide what encoding to use for the strings we return + output_encoding = (self._output_charset or + locale.getpreferredencoding()) + + return self._reencode_if_necessary(message, output_encoding) + + def ugettext(self, message): + if not isinstance(message, basestring): + return u'' + if self._fallback: + msg = to_unicode(message, encoding=self.input_charset) + try: + message = self._fallback.ugettext(msg) + except (AttributeError, UnicodeError): + # Ignore UnicodeErrors: We'll do our own decoding later + pass + + # Make sure we're returning unicode + return to_unicode(message, encoding=self.input_charset) + + def ungettext(self, msgid1, msgid2, n): + # Default + if n == 1: + message = msgid1 + else: + message = msgid2 + # Fallback might override this + if self._fallback: + msgid1 = to_unicode(msgid1, encoding=self.input_charset) + msgid2 = to_unicode(msgid2, encoding=self.input_charset) + try: + message = self._fallback.ungettext(msgid1, msgid2, n) + except (AttributeError, UnicodeError): + # Ignore UnicodeErrors: We'll do our own decoding later + pass + + # Make sure we're returning unicode + return to_unicode(message, encoding=self.input_charset, + nonstring='empty') + + +class NewGNUTranslations(DummyTranslations, gettext.GNUTranslations): + '''Safer version of :class:`gettext.GNUTranslations` + + :class:`gettext.GNUTranslations` suffers from two problems that this + class fixes. + + 1) :class:`gettext.GNUTranslations` can throw a + :exc:`~exceptions.UnicodeError` in + :meth:`gettext.GNUTranslations.ugettext` if the message being + translated has non-:term:`ASCII` characters and there is no translation + for it. 
+    2) :class:`gettext.GNUTranslations` can return byte :class:`str` from
+       :meth:`gettext.GNUTranslations.ugettext` and :class:`unicode`
+       strings from the other :meth:`~gettext.GNUTranslations.gettext`
+       methods if the message being translated is the wrong type
+
+    When byte :class:`str` are returned, the strings will be encoded
+    according to this algorithm:
+
+    1) If a fallback has been added, the fallback will be called first.
+       You'll need to consult the fallback to see whether it performs any
+       encoding changes.
+    2) If a byte :class:`str` was given, the same byte :class:`str` will
+       be returned.
+    3) If a :class:`unicode` string was given and
+       :meth:`set_output_charset` has been called then we encode the
+       string using the :attr:`output_charset`
+    4) If a :class:`unicode` string was given and this is :meth:`gettext`
+       or :meth:`ngettext` and a charset was detected when parsing the
+       :term:`message catalog`, output in that charset.
+    5) If a :class:`unicode` string was given and this is :meth:`gettext`
+       or :meth:`ngettext` we encode it using :term:`UTF-8`.
+    6) If a :class:`unicode` string was given and this is :meth:`lgettext`
+       or :meth:`lngettext` we encode using the value of
+       :func:`locale.getpreferredencoding`
+
+    For :meth:`ugettext` and :meth:`ungettext`, we go through the same set of
+    steps with the following differences:
+
+    * We transform byte :class:`str` into :class:`unicode` strings for these
+      methods.
+    * The encoding used to decode the byte :class:`str` is taken from
+      :attr:`input_charset` if it's set, otherwise we decode using
+      :term:`UTF-8`
+
+    .. attribute:: input_charset
+
+        an extension to the |stdlib|_ :mod:`gettext` that specifies what
+        charset a message is encoded in when decoding a message to
+        :class:`unicode`. This is used for two purposes:
+
+        1) If the message string is a byte :class:`str`, this is used to decode
+           the string to a :class:`unicode` string before looking it up in the
+           :term:`message catalog`.
+        2) In :meth:`~kitchen.i18n.DummyTranslations.ugettext` and
+           :meth:`~kitchen.i18n.DummyTranslations.ungettext` methods, if a byte
+           :class:`str` is given as the message and is untranslated this is used
+           as the encoding when decoding to :class:`unicode`. This is different
+           from the :attr:`_charset` parameter that may be set when a :term:`message
+           catalog` is loaded because :attr:`input_charset` is used to describe an
+           encoding used in a python source file while :attr:`_charset` describes
+           the encoding used in the :term:`message catalog` file.
+
+    Any characters that aren't able to be transformed from a byte
+    :class:`str` to :class:`unicode` string or vice versa will be replaced
+    with a replacement character (ie: ``u'�'`` in unicode based encodings,
+    ``'?'`` in other :term:`ASCII` compatible encodings).
+
+    .. seealso::
+
+        :class:`gettext.GNUTranslations.gettext`
+            For information about what methods this class has and what they do
+
+    .. versionchanged:: kitchen-1.1.0 ; API kitchen.i18n 2.1.0
+        Although we had adapted :meth:`gettext`, :meth:`ngettext`,
+        :meth:`lgettext`, and :meth:`lngettext` to always return
+        byte :class:`str`, we hadn't forced those byte :class:`str` to always
+        be in a specified charset. We now make sure that :meth:`gettext` and
+        :meth:`ngettext` return byte :class:`str` encoded using
+        :attr:`output_charset` if set, otherwise :attr:`charset` and if
+        neither of those, :term:`UTF-8`. With :meth:`lgettext` and
+        :meth:`lngettext` :attr:`output_charset` if set, otherwise
+        :func:`locale.getpreferredencoding`.
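+
+    As a rough sketch of these guarantees (the catalog file and strings here
+    are hypothetical)::
+
+        >>> translations = NewGNUTranslations(open('messages.mo', 'rb'))
+        >>> isinstance(translations.gettext(u'caf\xe9'), str)
+        True
+        >>> isinstance(translations.ugettext('caf\xe9'), unicode)
+        True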
+ ''' + #pylint: disable-msg=C0103,C0111 + def _parse(self, fp): + gettext.GNUTranslations._parse(self, fp) + + def gettext(self, message): + if not isinstance(message, basestring): + return '' + tmsg = message + u_message = to_unicode(message, encoding=self.input_charset) + try: + tmsg = self._catalog[u_message] #pylint:disable-msg=E1101 + except KeyError: + if self._fallback: + try: + tmsg = self._fallback.gettext(message) + except (AttributeError, UnicodeError): + # Ignore UnicodeErrors: We'll do our own encoding next + pass + + # Next decide what encoding to use for the strings we return + output_encoding = (self._output_charset or self._charset or + self.input_charset) + + return self._reencode_if_necessary(tmsg, output_encoding) + + def ngettext(self, msgid1, msgid2, n): + if n == 1: + tmsg = msgid1 + else: + tmsg = msgid2 + + if not isinstance(msgid1, basestring): + return '' + u_msgid1 = to_unicode(msgid1, encoding=self.input_charset) + try: + #pylint:disable-msg=E1101 + tmsg = self._catalog[(u_msgid1, self.plural(n))] + except KeyError: + if self._fallback: + try: + tmsg = self._fallback.ngettext(msgid1, msgid2, n) + except (AttributeError, UnicodeError): + # Ignore UnicodeErrors: We'll do our own encoding next + pass + + # Next decide what encoding to use for the strings we return + output_encoding = (self._output_charset or self._charset or + self.input_charset) + + return self._reencode_if_necessary(tmsg, output_encoding) + + def lgettext(self, message): + if not isinstance(message, basestring): + return '' + tmsg = message + u_message = to_unicode(message, encoding=self.input_charset) + try: + tmsg = self._catalog[u_message] #pylint:disable-msg=E1101 + except KeyError: + if self._fallback: + try: + tmsg = self._fallback.lgettext(message) + except (AttributeError, UnicodeError): + # Ignore UnicodeErrors: We'll do our own encoding next + pass + + # Next decide what encoding to use for the strings we return + output_encoding = (self._output_charset or + locale.getpreferredencoding()) + + return self._reencode_if_necessary(tmsg, output_encoding) + + def lngettext(self, msgid1, msgid2, n): + if n == 1: + tmsg = msgid1 + else: + tmsg = msgid2 + + if not isinstance(msgid1, basestring): + return '' + u_msgid1 = to_unicode(msgid1, encoding=self.input_charset) + try: + #pylint:disable-msg=E1101 + tmsg = self._catalog[(u_msgid1, self.plural(n))] + except KeyError: + if self._fallback: + try: + tmsg = self._fallback.ngettext(msgid1, msgid2, n) + except (AttributeError, UnicodeError): + # Ignore UnicodeErrors: We'll do our own encoding next + pass + + # Next decide what encoding to use for the strings we return + output_encoding = (self._output_charset or + locale.getpreferredencoding()) + + return self._reencode_if_necessary(tmsg, output_encoding) + + + def ugettext(self, message): + if not isinstance(message, basestring): + return u'' + message = to_unicode(message, encoding=self.input_charset) + try: + message = self._catalog[message] #pylint:disable-msg=E1101 + except KeyError: + if self._fallback: + try: + message = self._fallback.ugettext(message) + except (AttributeError, UnicodeError): + # Ignore UnicodeErrors: We'll do our own encoding next + pass + + # Make sure that we're returning unicode + return to_unicode(message, encoding=self.input_charset) + + def ungettext(self, msgid1, msgid2, n): + if n == 1: + tmsg = msgid1 + else: + tmsg = msgid2 + + if not isinstance(msgid1, basestring): + return u'' + u_msgid1 = to_unicode(msgid1, encoding=self.input_charset) + try: + 
#pylint:disable-msg=E1101
+            tmsg = self._catalog[(u_msgid1, self.plural(n))]
+        except KeyError:
+            if self._fallback:
+                try:
+                    tmsg = self._fallback.ungettext(msgid1, msgid2, n)
+                except (AttributeError, UnicodeError):
+                    # Ignore UnicodeErrors: We'll do our own encoding next
+                    pass
+
+        # Make sure that we're returning unicode
+        return to_unicode(tmsg, encoding=self.input_charset,
+                nonstring='empty')
+
+
+def get_translation_object(domain, localedirs=tuple(), languages=None,
+        class_=None, fallback=True, codeset=None):
+    '''Get a translation object bound to the :term:`message catalogs`
+
+    :arg domain: Name of the message domain. This should be a unique name
+        that can be used to lookup the :term:`message catalog` for this app or
+        library.
+    :kwarg localedirs: Iterator of directories to look for
+        :term:`message catalogs` under. The directories are searched in order
+        for :term:`message catalogs`. For each of the directories searched,
+        we check for message catalogs in any language specified
+        in :attr:`languages`. The :term:`message catalogs` are used to create
+        the Translation object that we return. The Translation object will
+        attempt to lookup the msgid in the first catalog that we found. If
+        it's not in there, it will go through each subsequent catalog looking
+        for a match. For this reason, the order in which you specify the
+        :attr:`localedirs` may be important. If no :term:`message catalogs`
+        are found, either return a :class:`DummyTranslations` object or raise
+        an :exc:`IOError` depending on the value of :attr:`fallback`.
+        The default localedir from :mod:`gettext` which is
+        :file:`os.path.join(sys.prefix, 'share', 'locale')` on Unix is
+        implicitly appended to the :attr:`localedirs`, making it the last
+        directory searched.
+    :kwarg languages: Iterator of language codes to check for
+        :term:`message catalogs`. If unspecified, the user's locale settings
+        will be used.
+
+        .. seealso:: :func:`gettext.find` for information on what environment
+            variables are used.
+
+    :kwarg class_: The class to use to extract translations from the
+        :term:`message catalogs`. Defaults to :class:`NewGNUTranslations`.
+    :kwarg fallback: If set to :data:`False`, raise an :exc:`IOError` if no
+        :term:`message catalogs` are found. If :data:`True`, the default,
+        return a :class:`DummyTranslations` object.
+    :kwarg codeset: Set the character encoding to use when returning byte
+        :class:`str` objects. This is equivalent to calling
+        :meth:`~gettext.GNUTranslations.output_charset` on the Translations
+        object that is returned from this function.
+    :return: Translation object to get :mod:`gettext` methods from
+
+    If you need more flexibility than :func:`easy_gettext_setup`, use this
+    function. It sets up a :mod:`gettext` Translation object and returns it
+    to you. Then you can access any of the methods of the object that you
+    need directly. For instance, if you specifically need to access
+    :func:`~gettext.GNUTranslations.lgettext`::
+
+        translations = get_translation_object('foo')
+        translations.lgettext('My Message')
+
+    This function is similar to the |stdlib|_ :func:`gettext.translation` but
+    makes it better in two ways:
+
+    1. It returns :class:`NewGNUTranslations` or :class:`DummyTranslations`
+       objects by default. These are superior to the
+       :class:`gettext.GNUTranslations` and :class:`gettext.NullTranslations`
+       objects because they are consistent in the string type they return and
+       they fix several issues that can cause the |stdlib|_ objects to throw
+       :exc:`UnicodeError`.
+    2.
This function takes multiple directories to search for + :term:`message catalogs`. + + The latter is important when setting up :mod:`gettext` in a portable + manner. There is not a common directory for translations across operating + systems so one needs to look in multiple directories for the translations. + :func:`get_translation_object` is able to handle that if you give it + a list of directories to search for catalogs:: + + translations = get_translation_object('foo', localedirs=( + os.path.join(os.path.realpath(os.path.dirname(__file__)), 'locale'), + os.path.join(sys.prefix, 'lib', 'locale'))) + + This will search for several different directories: + + 1. A directory named :file:`locale` in the same directory as the module + that called :func:`get_translation_object`, + 2. In :file:`/usr/lib/locale` + 3. In :file:`/usr/share/locale` (the fallback directory) + + This allows :mod:`gettext` to work on Windows and in development (where the + :term:`message catalogs` are typically in the toplevel module directory) + and also when installed under Linux (where the :term:`message catalogs` + are installed in :file:`/usr/share/locale`). You (or the system packager) + just need to install the :term:`message catalogs` in + :file:`/usr/share/locale` and remove the :file:`locale` directory from the + module to make this work. ie:: + + In development: + ~/foo # Toplevel module directory + ~/foo/__init__.py + ~/foo/locale # With message catalogs below here: + ~/foo/locale/es/LC_MESSAGES/foo.mo + + Installed on Linux: + /usr/lib/python2.7/site-packages/foo + /usr/lib/python2.7/site-packages/foo/__init__.py + /usr/share/locale/ # With message catalogs below here: + /usr/share/locale/es/LC_MESSAGES/foo.mo + + .. note:: + + This function will setup Translation objects that attempt to lookup + msgids in all of the found :term:`message catalogs`. This means if + you have several versions of the :term:`message catalogs` installed + in different directories that the function searches, you need to make + sure that :attr:`localedirs` specifies the directories so that newer + :term:`message catalogs` are searched first. It also means that if + a newer catalog does not contain a translation for a msgid but an + older one that's in :attr:`localedirs` does, the translation from that + older catalog will be returned. + + .. versionchanged:: kitchen-1.1.0 ; API kitchen.i18n 2.1.0 + Add more parameters to :func:`~kitchen.i18n.get_translation_object` so + it can more easily be used as a replacement for + :func:`gettext.translation`. Also change the way we use localedirs. + We cycle through them until we find a suitable locale file rather + than simply cycling through until we find a directory that exists. + The new code is based heavily on the |stdlib|_ + :func:`gettext.translation` function. 
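+
+    If you need the byte :class:`str` that the translation object returns to
+    be in one specific encoding no matter what the locale is, the
+    :attr:`codeset` argument described above will do that (an illustrative
+    call)::
+
+        translations = get_translation_object('foo', codeset='utf-8')
+        translations.lgettext('My Message')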
+
+    '''
+    if not class_:
+        class_ = NewGNUTranslations
+
+    mofiles = []
+    for localedir in itertools.chain(localedirs, (_DEFAULT_LOCALEDIR,)):
+        mofiles.extend(gettext.find(domain, localedir, languages, all=1))
+    if not mofiles:
+        if fallback:
+            return DummyTranslations()
+        raise IOError(ENOENT, 'No translation file found for domain', domain)
+
+    # Accumulate a translation with fallbacks to all the other mofiles
+    stacked_translations = None
+    for mofile in mofiles:
+        full_path = os.path.abspath(mofile)
+        translation = _translations.get(full_path)
+        if not translation:
+            mofile_fh = open(full_path, 'rb')
+            try:
+                translation = _translations.setdefault(full_path,
+                        class_(mofile_fh))
+            finally:
+                mofile_fh.close()
+
+        # Shallow copy the object so that the fallbacks and output charset can
+        # differ but the data we read from the mofile is shared.
+        translation = copy.copy(translation)
+        if codeset:
+            translation.set_output_charset(codeset)
+        if not stacked_translations:
+            stacked_translations = translation
+        else:
+            stacked_translations.add_fallback(translation)
+
+    return stacked_translations
+
+def easy_gettext_setup(domain, localedirs=tuple(), use_unicode=True):
+    ''' Setup translation functions for an application
+
+    :arg domain: Name of the message domain. This should be a unique name
+        that can be used to lookup the :term:`message catalog` for this app.
+    :kwarg localedirs: Iterator of directories to look for :term:`message
+        catalogs` under. The first directory to exist is used regardless of
+        whether messages for this domain are present. If none of the
+        directories exist, fallback on ``sys.prefix`` + :file:`/share/locale`
+        Default: No directories to search so we just use the fallback.
+    :kwarg use_unicode: If :data:`True` return the :mod:`gettext` functions
+        for :class:`unicode` strings else return the functions for byte
+        :class:`str` for the translations. Default is :data:`True`.
+    :return: tuple of the :mod:`gettext` function and :mod:`gettext` function
+        for plurals
+
+    Setting up :mod:`gettext` can be a little tricky because of lack of
+    documentation. This function will setup :mod:`gettext` using the
+    `Class-based API
+    `_ for you.
+    For the simple case, you can accept the default keyword arguments and
+    simply give it your message domain, like this::
+
+        _, N_ = easy_gettext_setup('myprogram')
+
+    This will get you two functions, :func:`_` and :func:`N_`, that you can
+    use to mark strings in your code for translation. :func:`_` is used to
+    mark strings that don't need to worry about plural forms no matter what
+    the value of the variable is. :func:`N_` is used to mark strings that do
+    need to have a different form if a variable in the string is plural.
+
+    .. seealso::
+
+        :doc:`api-i18n`
+            This module's documentation has examples of using :func:`_` and :func:`N_`
+        :func:`get_translation_object`
+            for information on how to use :attr:`localedirs` to get the
+            proper :term:`message catalogs` both when in development and when
+            installed to FHS compliant directories on Linux.
+
+    .. note::
+
+        The gettext functions returned from this function should be superior
+        to the ones returned from :mod:`gettext`. The traits that make them
+        better are described in the :class:`DummyTranslations` and
+        :class:`NewGNUTranslations` documentation.
+
+    .. versionchanged:: kitchen-0.2.4 ; API kitchen.i18n 2.0.0
+        Changed :func:`~kitchen.i18n.easy_gettext_setup` to return the lgettext
+        functions instead of gettext functions when use_unicode=False.
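+
+    A byte-oriented setup (mirroring how kitchen itself calls this function
+    in :file:`kitchen/__init__.py`) looks like::
+
+        b_, bN_ = easy_gettext_setup('myprogram', use_unicode=False)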
+
+    '''
+    translations = get_translation_object(domain, localedirs=localedirs)
+    if use_unicode:
+        return(translations.ugettext, translations.ungettext)
+    return(translations.lgettext, translations.lngettext)
+
+__all__ = ('DummyTranslations', 'NewGNUTranslations', 'easy_gettext_setup',
+        'get_translation_object')
diff --git a/kitchen/iterutils/__init__.py b/kitchen/iterutils/__init__.py
new file mode 100644
index 0000000..d96d84d
--- /dev/null
+++ b/kitchen/iterutils/__init__.py
@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2010 Red Hat, Inc
+#
+# kitchen is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# kitchen is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+# more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with kitchen; if not, see
+#
+# Authors:
+#   Toshio Kuratomi
+#   Luke Macken
+#
+# Portions of code taken from python-fedora fedora/iterutils.py
+'''
+Functions to manipulate iterables
+
+.. versionadded:: Kitchen: 0.2.1a1
+
+.. moduleauthor:: Toshio Kuratomi
+.. moduleauthor:: Luke Macken
+'''
+
+from kitchen.versioning import version_tuple_to_string
+
+__version_info__ = ((0, 0, 1),)
+__version__ = version_tuple_to_string(__version_info__)
+
+def isiterable(obj, include_string=False):
+    '''Check whether an object is an iterable
+
+    :arg obj: Object to test whether it is an iterable
+    :kwarg include_string: If :data:`True` and :attr:`obj` is a byte
+        :class:`str` or :class:`unicode` string this function will return
+        :data:`True`. If set to :data:`False`, byte :class:`str` and
+        :class:`unicode` strings will cause this function to return
+        :data:`False`. Default :data:`False`.
+    :returns: :data:`True` if :attr:`obj` is iterable, otherwise
+        :data:`False`.
+    '''
+    if include_string or not isinstance(obj, basestring):
+        try:
+            iter(obj)
+        except TypeError:
+            return False
+        else:
+            return True
+    return False
+
+def iterate(obj, include_string=False):
+    '''Generator that can be used to iterate over anything
+
+    :arg obj: The object to iterate over
+    :kwarg include_string: if :data:`True`, treat strings as iterables.
+        Otherwise treat them as a single scalar value. Default :data:`False`
+
+    This function will create an iterator out of any scalar or iterable. It
+    is useful for making a value that you are given into an iterable before
+    operating on it. Iterables have their items returned; scalars are
+    transformed into iterables. A string is treated as a scalar value unless
+    the :attr:`include_string` parameter is set to :data:`True`.
Example usage:: + + >>> list(iterate(None)) + [None] + >>> list(iterate([None])) + [None] + >>> list(iterate([1, 2, 3])) + [1, 2, 3] + >>> list(iterate(set([1, 2, 3]))) + [1, 2, 3] + >>> list(iterate(dict(a='1', b='2'))) + ['a', 'b'] + >>> list(iterate(1)) + [1] + >>> list(iterate(iter([1, 2, 3]))) + [1, 2, 3] + >>> list(iterate('abc')) + ['abc'] + >>> list(iterate('abc', include_string=True)) + ['a', 'b', 'c'] + ''' + if isiterable(obj, include_string=include_string): + for item in obj: + yield item + else: + yield obj + +__all__ = ('isiterable', 'iterate',) diff --git a/kitchen/pycompat24/__init__.py b/kitchen/pycompat24/__init__.py new file mode 100644 index 0000000..eb2ada8 --- /dev/null +++ b/kitchen/pycompat24/__init__.py @@ -0,0 +1,10 @@ +''' +The :mod:`kitchen.pycompat24` module contains implementations of functionality +introduced in python-2.4 for use on earlier versions of python. +''' +from kitchen.versioning import version_tuple_to_string + +__version_info__ = ((1, 1, 0),) +__version__ = version_tuple_to_string(__version_info__) + +__all__ = ('base64', 'sets', 'subprocess') diff --git a/kitchen/pycompat24/base64/__init__.py b/kitchen/pycompat24/base64/__init__.py new file mode 100644 index 0000000..ff9f75d --- /dev/null +++ b/kitchen/pycompat24/base64/__init__.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2010 Red Hat, Inc +# +# This file is part of kitchen +# +# kitchen is free software; you can redistribute it and/or modify it under the +# terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 2.1 of the License, or (at your option) +# any later version. +# +# kitchen is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +# more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with kitchen; if not, see +# +# Authors: +# Toshio Kuratomi + +''' +Implement the modern base64 interface. + +Python-2.4 and above have a new API for the base64 module. This is a backport +of that module for use on python-2.3. + +.. seealso:: + :mod:`base64` + for information about using the functions provided here. +''' +import sys + +# :W0401,W0614: The purpose of this module is to create a backport of base64 +# so we ignore these pylint warnings +#pylint:disable-msg=W0401,W0614 +if sys.version_info >= (2, 4): + from base64 import * +else: + from kitchen.pycompat24.base64._base64 import * + +__all__ = ( 'b16decode', 'b16encode', 'b32decode', 'b32encode', 'b64decode', + 'b64encode', 'decode', 'decodestring', 'encode', 'encodestring', + 'standard_b64decode', 'standard_b64encode', 'urlsafe_b64decode', + 'urlsafe_b64encode',) diff --git a/kitchen/pycompat24/base64/_base64.py b/kitchen/pycompat24/base64/_base64.py new file mode 100644 index 0000000..1278f1a --- /dev/null +++ b/kitchen/pycompat24/base64/_base64.py @@ -0,0 +1,363 @@ +#! 
/usr/bin/env python + +"""RFC 3548: Base16, Base32, Base64 Data Encodings""" + +# Modified 04-Oct-1995 by Jack Jansen to use binascii module +# Modified 30-Dec-2003 by Barry Warsaw to add full RFC 3548 support + +import re +import struct +import binascii + + +__all__ = [ + # Legacy interface exports traditional RFC 1521 Base64 encodings + 'encode', 'decode', 'encodestring', 'decodestring', + # Generalized interface for other encodings + 'b64encode', 'b64decode', 'b32encode', 'b32decode', + 'b16encode', 'b16decode', + # Standard Base64 encoding + 'standard_b64encode', 'standard_b64decode', + # Some common Base64 alternatives. As referenced by RFC 3458, see thread + # starting at: + # + # http://zgp.org/pipermail/p2p-hackers/2001-September/000316.html + 'urlsafe_b64encode', 'urlsafe_b64decode', + ] + +_translation = [chr(_x) for _x in range(256)] +EMPTYSTRING = '' + + +def _translate(s, altchars): + translation = _translation[:] + for k, v in altchars.items(): + translation[ord(k)] = v + return s.translate(''.join(translation)) + + + +# Base64 encoding/decoding uses binascii + +def b64encode(s, altchars=None): + """Encode a string using Base64. + + s is the string to encode. Optional altchars must be a string of at least + length 2 (additional characters are ignored) which specifies an + alternative alphabet for the '+' and '/' characters. This allows an + application to e.g. generate url or filesystem safe Base64 strings. + + The encoded string is returned. + """ + # Strip off the trailing newline + encoded = binascii.b2a_base64(s)[:-1] + if altchars is not None: + return _translate(encoded, {'+': altchars[0], '/': altchars[1]}) + return encoded + + +def b64decode(s, altchars=None): + """Decode a Base64 encoded string. + + s is the string to decode. Optional altchars must be a string of at least + length 2 (additional characters are ignored) which specifies the + alternative alphabet used instead of the '+' and '/' characters. + + The decoded string is returned. A TypeError is raised if s were + incorrectly padded or if there are non-alphabet characters present in the + string. + """ + if altchars is not None: + s = _translate(s, {altchars[0]: '+', altchars[1]: '/'}) + try: + return binascii.a2b_base64(s) + except binascii.Error, msg: + # Transform this exception for consistency + raise TypeError(msg) + + +def standard_b64encode(s): + """Encode a string using the standard Base64 alphabet. + + s is the string to encode. The encoded string is returned. + """ + return b64encode(s) + +def standard_b64decode(s): + """Decode a string encoded with the standard Base64 alphabet. + + s is the string to decode. The decoded string is returned. A TypeError + is raised if the string is incorrectly padded or if there are non-alphabet + characters present in the string. + """ + return b64decode(s) + +def urlsafe_b64encode(s): + """Encode a string using a url-safe Base64 alphabet. + + s is the string to encode. The encoded string is returned. The alphabet + uses '-' instead of '+' and '_' instead of '/'. + """ + return b64encode(s, '-_') + +def urlsafe_b64decode(s): + """Decode a string encoded with the standard Base64 alphabet. + + s is the string to decode. The decoded string is returned. A TypeError + is raised if the string is incorrectly padded or if there are non-alphabet + characters present in the string. + + The alphabet uses '-' instead of '+' and '_' instead of '/'. 
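+
+    A round-trip illustration (the values are worked out from the
+    substitution of '-' for '+' and '_' for '/' described above):
+
+    >>> urlsafe_b64encode('\xff\xe7')
+    '_-c='
+    >>> urlsafe_b64decode('_-c=')
+    '\xff\xe7'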
+ """ + return b64decode(s, '-_') + + + +# Base32 encoding/decoding must be done in Python +_b32alphabet = { + 0: 'A', 9: 'J', 18: 'S', 27: '3', + 1: 'B', 10: 'K', 19: 'T', 28: '4', + 2: 'C', 11: 'L', 20: 'U', 29: '5', + 3: 'D', 12: 'M', 21: 'V', 30: '6', + 4: 'E', 13: 'N', 22: 'W', 31: '7', + 5: 'F', 14: 'O', 23: 'X', + 6: 'G', 15: 'P', 24: 'Y', + 7: 'H', 16: 'Q', 25: 'Z', + 8: 'I', 17: 'R', 26: '2', + } + +_b32tab = _b32alphabet.items() +_b32tab.sort() +_b32tab = [v for k, v in _b32tab] +_b32rev = dict([(v, long(k)) for k, v in _b32alphabet.items()]) + + +def b32encode(s): + """Encode a string using Base32. + + s is the string to encode. The encoded string is returned. + """ + parts = [] + quanta, leftover = divmod(len(s), 5) + # Pad the last quantum with zero bits if necessary + if leftover: + s += ('\0' * (5 - leftover)) + quanta += 1 + for i in range(quanta): + # c1 and c2 are 16 bits wide, c3 is 8 bits wide. The intent of this + # code is to process the 40 bits in units of 5 bits. So we take the 1 + # leftover bit of c1 and tack it onto c2. Then we take the 2 leftover + # bits of c2 and tack them onto c3. The shifts and masks are intended + # to give us values of exactly 5 bits in width. + c1, c2, c3 = struct.unpack('!HHB', s[i*5:(i+1)*5]) + c2 += (c1 & 1) << 16 # 17 bits wide + c3 += (c2 & 3) << 8 # 10 bits wide + parts.extend([_b32tab[c1 >> 11], # bits 1 - 5 + _b32tab[(c1 >> 6) & 0x1f], # bits 6 - 10 + _b32tab[(c1 >> 1) & 0x1f], # bits 11 - 15 + _b32tab[c2 >> 12], # bits 16 - 20 (1 - 5) + _b32tab[(c2 >> 7) & 0x1f], # bits 21 - 25 (6 - 10) + _b32tab[(c2 >> 2) & 0x1f], # bits 26 - 30 (11 - 15) + _b32tab[c3 >> 5], # bits 31 - 35 (1 - 5) + _b32tab[c3 & 0x1f], # bits 36 - 40 (1 - 5) + ]) + encoded = EMPTYSTRING.join(parts) + # Adjust for any leftover partial quanta + if leftover == 1: + return encoded[:-6] + '======' + elif leftover == 2: + return encoded[:-4] + '====' + elif leftover == 3: + return encoded[:-3] + '===' + elif leftover == 4: + return encoded[:-1] + '=' + return encoded + + +def b32decode(s, casefold=False, map01=None): + """Decode a Base32 encoded string. + + s is the string to decode. Optional casefold is a flag specifying whether + a lowercase alphabet is acceptable as input. For security purposes, the + default is False. + + RFC 3548 allows for optional mapping of the digit 0 (zero) to the letter O + (oh), and for optional mapping of the digit 1 (one) to either the letter I + (eye) or letter L (el). The optional argument map01 when not None, + specifies which letter the digit 1 should be mapped to (when map01 is not + None, the digit 0 is always mapped to the letter O). For security + purposes the default is None, so that 0 and 1 are not allowed in the + input. + + The decoded string is returned. A TypeError is raised if s were + incorrectly padded or if there are non-alphabet characters present in the + string. + """ + quanta, leftover = divmod(len(s), 8) + if leftover: + raise TypeError('Incorrect padding') + # Handle section 2.4 zero and one mapping. The flag map01 will be either + # False, or the character to map the digit 1 (one) to. It should be + # either L (el) or I (eye). + if map01: + s = _translate(s, {'0': 'O', '1': map01}) + if casefold: + s = s.upper() + # Strip off pad characters from the right. We need to count the pad + # characters because this will tell us how many null bytes to remove from + # the end of the decoded string. 
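+    # (A single '=' means the final 5-byte quantum carried four data bytes;
+    # six of them mean it carried only one.)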
+ padchars = 0 + mo = re.search('(?P[=]*)$', s) + if mo: + padchars = len(mo.group('pad')) + if padchars > 0: + s = s[:-padchars] + # Now decode the full quanta + parts = [] + acc = 0 + shift = 35 + for c in s: + val = _b32rev.get(c) + if val is None: + raise TypeError('Non-base32 digit found') + acc += _b32rev[c] << shift + shift -= 5 + if shift < 0: + parts.append(binascii.unhexlify('%010x' % acc)) + acc = 0 + shift = 35 + # Process the last, partial quanta + last = binascii.unhexlify('%010x' % acc) + if padchars == 0: + last = '' # No characters + elif padchars == 1: + last = last[:-1] + elif padchars == 3: + last = last[:-2] + elif padchars == 4: + last = last[:-3] + elif padchars == 6: + last = last[:-4] + else: + raise TypeError('Incorrect padding') + parts.append(last) + return EMPTYSTRING.join(parts) + + + +# RFC 3548, Base 16 Alphabet specifies uppercase, but hexlify() returns +# lowercase. The RFC also recommends against accepting input case +# insensitively. +def b16encode(s): + """Encode a string using Base16. + + s is the string to encode. The encoded string is returned. + """ + return binascii.hexlify(s).upper() + + +def b16decode(s, casefold=False): + """Decode a Base16 encoded string. + + s is the string to decode. Optional casefold is a flag specifying whether + a lowercase alphabet is acceptable as input. For security purposes, the + default is False. + + The decoded string is returned. A TypeError is raised if s were + incorrectly padded or if there are non-alphabet characters present in the + string. + """ + if casefold: + s = s.upper() + if re.search('[^0-9A-F]', s): + raise TypeError('Non-base16 digit found') + return binascii.unhexlify(s) + + + +# Legacy interface. This code could be cleaned up since I don't believe +# binascii has any line length limitations. It just doesn't seem worth it +# though. + +MAXLINESIZE = 76 # Excluding the CRLF +MAXBINSIZE = (MAXLINESIZE//4)*3 + +def encode(input, output): + """Encode a file.""" + while True: + s = input.read(MAXBINSIZE) + if not s: + break + while len(s) < MAXBINSIZE: + ns = input.read(MAXBINSIZE-len(s)) + if not ns: + break + s += ns + line = binascii.b2a_base64(s) + output.write(line) + + +def decode(input, output): + """Decode a file.""" + while True: + line = input.readline() + if not line: + break + s = binascii.a2b_base64(line) + output.write(s) + + +def encodestring(s): + """Encode a string into multiple lines of base-64 data.""" + pieces = [] + for i in range(0, len(s), MAXBINSIZE): + chunk = s[i : i + MAXBINSIZE] + pieces.append(binascii.b2a_base64(chunk)) + return "".join(pieces) + + +def decodestring(s): + """Decode a string.""" + return binascii.a2b_base64(s) + + + +# Useable as a script... 
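+#
+# For example, the legacy file interface above can be used like this (an
+# illustrative sketch; 'plain.txt' and 'plain.b64' are made-up file names):
+#
+#     fin = open('plain.txt', 'rb')
+#     fout = open('plain.b64', 'wb')
+#     try:
+#         encode(fin, fout)    # emits base64 in 76-character MIME lines
+#     finally:
+#         fin.close()
+#         fout.close()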
+def test():
+    """Small test program"""
+    import sys, getopt
+    try:
+        opts, args = getopt.getopt(sys.argv[1:], 'deut')
+    except getopt.error, msg:
+        sys.stdout = sys.stderr
+        print msg
+        print """usage: %s [-d|-e|-u|-t] [file|-]
+        -d, -u: decode
+        -e: encode (default)
+        -t: encode and decode string 'Aladdin:open sesame'"""%sys.argv[0]
+        sys.exit(2)
+    func = encode
+    for o, a in opts:
+        if o == '-e': func = encode
+        if o == '-d': func = decode
+        if o == '-u': func = decode
+        if o == '-t': test1(); return
+    if args and args[0] != '-':
+        fh = open(args[0], 'rb')
+        try:
+            func(fh, sys.stdout)
+        finally:
+            fh.close()
+    else:
+        func(sys.stdin, sys.stdout)
+
+
+def test1():
+    s0 = "Aladdin:open sesame"
+    s1 = encodestring(s0)
+    s2 = decodestring(s1)
+    print s0, repr(s1), s2
+
+
+if __name__ == '__main__':
+    test()
diff --git a/kitchen/pycompat24/sets/__init__.py b/kitchen/pycompat24/sets/__init__.py
new file mode 100644
index 0000000..8091b9b
--- /dev/null
+++ b/kitchen/pycompat24/sets/__init__.py
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2010 Red Hat, Inc
+#
+# This file is part of kitchen
+#
+# kitchen is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# kitchen is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for
+# more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with kitchen; if not, see <http://www.gnu.org/licenses/>
+#
+# Authors:
+#   Toshio Kuratomi
+
+'''
+In python-2.4, a builtin :class:`set` type was added to python.  This module
+provides a function to emulate that on python-2.3 by using the :mod:`sets`
+module.
+
+:func:`set`
+    Create a set.  If running on python 2.4+ this is the :class:`set`
+    constructor.  If using python-2.3, it's :class:`sets.Set`.
+
+:func:`frozenset`
+    Create a frozenset.  If running on python2.4+ this is the
+    :class:`frozenset` constructor.  If using python-2.3, it's
+    :class:`sets.ImmutableSet`.
+
+.. versionchanged:: 0.2.0
+    API: kitchen.pycompat24 1.0.0
+    Added set and frozenset
+'''
+import __builtin__
+
+# Setup set and frozenset on this module
+# :W0622,C0103: The purpose of this module is to define set and frozenset if
+# they aren't in builtins already so we disregard these pylint warnings
+#pylint:disable-msg=W0622,C0103
+if not hasattr(__builtin__, 'set'):
+    import sets
+    set = sets.Set
+else:
+    set = set
+
+if not hasattr(__builtin__, 'frozenset'):
+    import sets
+    frozenset = sets.ImmutableSet
+else:
+    frozenset = frozenset
+#pylint:enable-msg=W0622,C0103
+
+def add_builtin_set():
+    '''If there's no set builtin, use the :mod:`sets` module to make one
+
+    This function makes sure that a :class:`set` and :class:`frozenset` type
+    are available in the :mod:`__builtin__` namespace.  Since the function
+    checks whether :class:`set` and :class:`frozenset` are already present in
+    the :mod:`__builtin__` namespace and refuses to overwrite those if found,
+    it's safe to call this in multiple places and in scripts run under
+    python-2.4+, where a more efficient set implementation is already present
+    in the :mod:`__builtin__` namespace.
+
+    However, since this function modifies :mod:`__builtin__` there's no need
+    to call it more than once so you likely want to do something like this
+    when your program loads::
+
+        myprogram/__init__.py:
+
+        from kitchen.pycompat24 import sets
+        sets.add_builtin_set()
+
+    You can then use :func:`set` and :func:`frozenset` anywhere in your code::
+
+        myprogram/compute.py:
+
+        def math_students(algebra_student_list, geometry_student_list):
+            return set(algebra_student_list).union(set(geometry_student_list))
+    '''
+    if not hasattr(__builtin__, 'set'):
+        __builtin__.set = set
+
+    if not hasattr(__builtin__, 'frozenset'):
+        __builtin__.frozenset = frozenset
+
+__all__ = ('add_builtin_set', 'set', 'frozenset')
diff --git a/kitchen/pycompat24/subprocess.py b/kitchen/pycompat24/subprocess.py
new file mode 100644
index 0000000..d91af54
--- /dev/null
+++ b/kitchen/pycompat24/subprocess.py
@@ -0,0 +1,5 @@
+# :W0401, W0611, W0614: Rather than have two versions of subprocess, we import
+# the python2.7 version here as well
+#pylint:disable-msg=W0401,W0611,W0614
+from kitchen.pycompat27.subprocess import *
+from kitchen.pycompat27.subprocess import __all__
diff --git a/kitchen/pycompat25/__init__.py b/kitchen/pycompat25/__init__.py
new file mode 100644
index 0000000..18a9ce4
--- /dev/null
+++ b/kitchen/pycompat25/__init__.py
@@ -0,0 +1,12 @@
+'''
+The :mod:`kitchen.pycompat25` module contains implementations of functionality
+introduced in python-2.5.
+'''
+
+from kitchen.versioning import version_tuple_to_string
+
+__version_info__ = ((1, 0, 0),)
+__version__ = version_tuple_to_string(__version_info__)
+
+
+__all__ = ('collections',)
diff --git a/kitchen/pycompat25/collections/__init__.py b/kitchen/pycompat25/collections/__init__.py
new file mode 100644
index 0000000..77acc21
--- /dev/null
+++ b/kitchen/pycompat25/collections/__init__.py
@@ -0,0 +1,9 @@
+try:
+    #:E0611: defaultdict doesn't exist in python-2.4 or less but that's why we
+    # have it in a try: except:.  So we can use our version if necessary.
+    #pylint:disable-msg=E0611
+    from collections import defaultdict
+except ImportError:
+    from kitchen.pycompat25.collections._defaultdict import defaultdict
+
+__all__ = ('defaultdict',)
diff --git a/kitchen/pycompat25/collections/_defaultdict.py b/kitchen/pycompat25/collections/_defaultdict.py
new file mode 100644
index 0000000..0560a3b
--- /dev/null
+++ b/kitchen/pycompat25/collections/_defaultdict.py
@@ -0,0 +1,137 @@
+##
+# Transcribed from http://code.activestate.com/recipes/523034/ on May 1, 2009
+# by Jef Spaleta.  This code provides an emulation for the defaultdict
+# functionality introduced in python 2.5's collections module
+#
+# Changes from the original:
+#  * Change the return value from __reduce__ to use iteritems() to prevent
+#    a segfault when pickling. (Jef Spaleta)
+#  * Change how we setup the module to use collections.defaultdict by default
+#    (Toshio Kuratomi)
+#
+# Copyright (c) 2007 Justin Kirtland
+#
+# PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
+# --------------------------------------------
+#
+# 1. This LICENSE AGREEMENT is between the Python Software Foundation ("PSF"),
+#    and the Individual or Organization ("Licensee") accessing and otherwise
+#    using this software ("Python") in source or binary form and its
+#    associated documentation.
+#
+# 2.
Subject to the terms and conditions of this License Agreement, PSF hereby +# grants Licensee a nonexclusive, royalty-free, world-wide license to +# reproduce, analyze, test, perform and/or display publicly, prepare +# derivative works, distribute, and otherwise use Python alone or in any +# derivative version, provided, however, that PSF's License Agreement and +# PSF's notice of copyright, i.e., "Copyright (c) 2001, 2002, 2003, 2004, +# 2005, 2006 Python Software Foundation; All Rights Reserved" are retained +# in Python alone or in any derivative version prepared by Licensee. +# +# 3. In the event Licensee prepares a derivative work that is based on or +# incorporates Python or any part thereof, and wants to make the derivative +# work available to others as provided herein, then Licensee hereby agrees +# to include in any such work a brief summary of the changes made to +# Python. +# +# 4. PSF is making Python available to Licensee on an "AS IS" basis. PSF +# MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF +# EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND DISCLAIMS ANY +# REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS FOR ANY +# PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT INFRINGE ANY THIRD +# PARTY RIGHTS. +# +# 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON FOR ANY +# INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF +# MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, OR ANY DERIVATIVE +# THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. +# +# 6. This License Agreement will automatically terminate upon a material +# breach of its terms and conditions. +# +# 7. Nothing in this License Agreement shall be deemed to create any +# relationship of agency, partnership, or joint venture between PSF and +# Licensee. This License Agreement does not grant permission to use PSF +# trademarks or trade name in a trademark sense to endorse or promote +# products or services of Licensee, or any third party. +# +# 8. By copying, installing or otherwise using Python, Licensee agrees to be +# bound by the terms and conditions of this License Agreement. + +''' +----------- +defaultdict +----------- + +This is a pure python implementation of defaultdict that is compatible with +the defaultdict class provided by python-2.5 and above. + +.. seealso:: + :class:`collections.defaultdict` + for documentation on this module +''' + +# Pylint disabled messages +# +# :C0103: We're defnining a compatible class name therefore we need to match +# the format of that name. + +import types + +from kitchen import b_ + +# :C0103, W0613: We're implementing the python-2.5 defaultdict API so +# we have to use the same names as python. +# :C0111: We point people at the stdlib API docs for defaultdict rather than +# reproduce it here. 
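+#
+# A quick usage sketch (illustrative only; it mirrors the python-2.5 API that
+# the class below emulates):
+#
+#     from kitchen.pycompat25.collections import defaultdict
+#     counts = defaultdict(int)
+#     for word in ('spam', 'eggs', 'spam'):
+#         counts[word] += 1
+#     # counts['spam'] == 2; a missing key is created via int() -> 0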
+#pylint:disable-msg=C0103,W0613,C0111 + +class defaultdict(dict): + def __init__(self, default_factory=None, *args, **kwargs): + if (default_factory is not None and + not hasattr(default_factory, '__call__')): + raise TypeError(b_('First argument must be callable')) + dict.__init__(self, *args, **kwargs) + self.default_factory = default_factory + + def __getitem__(self, key): + try: + return dict.__getitem__(self, key) + except KeyError: + return self.__missing__(key) + + def __missing__(self, key): + if self.default_factory is None: + raise KeyError(key) + self[key] = value = self.default_factory() + return value + + def __reduce__(self): + if self.default_factory is None: + args = tuple() + else: + args = self.default_factory, + return type(self), args, None, None, self.iteritems() + + def copy(self): + return self.__copy__() + + def __copy__(self): + return type(self)(self.default_factory, self) + + def __deepcopy__(self, memo): + import copy + return type(self)(self.default_factory, + copy.deepcopy(self.items())) + def __repr__(self): + # Note: Have to use "is not None" otherwise we get an infinite + # recursion + if isinstance(self.default_factory, types.MethodType) \ + and self.default_factory.im_self is not None \ + and issubclass(self.default_factory.im_class, defaultdict): + defrepr = ' +# +# Authors: +# Toshio Kuratomi + +''' +Implement the modern subprocess interface + +Python-2.5 and python-2.7 introduce new API features to subprocess. This is +a backport of that module for use on earlier python versions. + +.. seealso:: + :mod:`subprocess` + for information about using the functions provided here. +''' +import sys + +# :W0401,W0611,W0614: We're importing compatibility to the python-2.7 version +# of subprocess. +#pylint:disable-msg=W0401,W0611,W0614 +if sys.version_info >= (2, 7): + from subprocess import * + from subprocess import MAXFD, list2cmdline, mswindows + from subprocess import __all__ +else: + from kitchen.pycompat27.subprocess._subprocess import * + from kitchen.pycompat27.subprocess._subprocess import MAXFD, \ + list2cmdline, mswindows + from kitchen.pycompat27.subprocess._subprocess import __all__ diff --git a/kitchen/pycompat27/subprocess/_subprocess.py b/kitchen/pycompat27/subprocess/_subprocess.py new file mode 100644 index 0000000..d88e447 --- /dev/null +++ b/kitchen/pycompat27/subprocess/_subprocess.py @@ -0,0 +1,1538 @@ +# subprocess - Subprocesses with accessible I/O streams +# +# For more information about this module, see PEP 324. +# +# This module should remain compatible with Python 2.2, see PEP 291. +# +# Copyright (c) 2003-2005 by Peter Astrand +# +# Licensed to PSF under a Contributor Agreement. +# See http://www.python.org/2.4/license for licensing details. + +r"""subprocess - Subprocesses with accessible I/O streams + +This module allows you to spawn processes, connect to their +input/output/error pipes, and obtain their return codes. This module +intends to replace several other, older modules and functions, like: + +os.system +os.spawn* +os.popen* +popen2.* +commands.* + +Information about how the subprocess module can be used to replace these +modules and functions can be found below. 
+ + + +Using the subprocess module +=========================== +This module defines one class called Popen: + +class Popen(args, bufsize=0, executable=None, + stdin=None, stdout=None, stderr=None, + preexec_fn=None, close_fds=False, shell=False, + cwd=None, env=None, universal_newlines=False, + startupinfo=None, creationflags=0): + + +Arguments are: + +args should be a string, or a sequence of program arguments. The +program to execute is normally the first item in the args sequence or +string, but can be explicitly set by using the executable argument. + +On UNIX, with shell=False (default): In this case, the Popen class +uses os.execvp() to execute the child program. args should normally +be a sequence. A string will be treated as a sequence with the string +as the only item (the program to execute). + +On UNIX, with shell=True: If args is a string, it specifies the +command string to execute through the shell. If args is a sequence, +the first item specifies the command string, and any additional items +will be treated as additional shell arguments. + +On Windows: the Popen class uses CreateProcess() to execute the child +program, which operates on strings. If args is a sequence, it will be +converted to a string using the list2cmdline method. Please note that +not all MS Windows applications interpret the command line the same +way: The list2cmdline is designed for applications using the same +rules as the MS C runtime. + +bufsize, if given, has the same meaning as the corresponding argument +to the built-in open() function: 0 means unbuffered, 1 means line +buffered, any other positive value means use a buffer of +(approximately) that size. A negative bufsize means to use the system +default, which usually means fully buffered. The default value for +bufsize is 0 (unbuffered). + +stdin, stdout and stderr specify the executed programs' standard +input, standard output and standard error file handles, respectively. +Valid values are PIPE, an existing file descriptor (a positive +integer), an existing file object, and None. PIPE indicates that a +new pipe to the child should be created. With None, no redirection +will occur; the child's file handles will be inherited from the +parent. Additionally, stderr can be STDOUT, which indicates that the +stderr data from the applications should be captured into the same +file handle as for stdout. + +If preexec_fn is set to a callable object, this object will be called +in the child process just before the child is executed. + +If close_fds is true, all file descriptors except 0, 1 and 2 will be +closed before the child process is executed. + +if shell is true, the specified command will be executed through the +shell. + +If cwd is not None, the current directory will be changed to cwd +before the child is executed. + +If env is not None, it defines the environment variables for the new +process. + +If universal_newlines is true, the file objects stdout and stderr are +opened as a text files, but lines may be terminated by any of '\n', +the Unix end-of-line convention, '\r', the Macintosh convention or +'\r\n', the Windows convention. All of these external representations +are seen as '\n' by the Python program. Note: This feature is only +available if Python is built with universal newline support (the +default). Also, the newlines attribute of the file objects stdout, +stdin and stderr are not updated by the communicate() method. + +The startupinfo and creationflags, if given, will be passed to the +underlying CreateProcess() function. 
They can specify things such as +appearance of the main window and priority for the new process. +(Windows only) + + +This module also defines some shortcut functions: + +call(*popenargs, **kwargs): + Run command with arguments. Wait for command to complete, then + return the returncode attribute. + + The arguments are the same as for the Popen constructor. Example: + + retcode = call(["ls", "-l"]) + +check_call(*popenargs, **kwargs): + Run command with arguments. Wait for command to complete. If the + exit code was zero then return, otherwise raise + CalledProcessError. The CalledProcessError object will have the + return code in the returncode attribute. + + The arguments are the same as for the Popen constructor. Example: + + check_call(["ls", "-l"]) + +check_output(*popenargs, **kwargs): + Run command with arguments and return its output as a byte string. + + If the exit code was non-zero it raises a CalledProcessError. The + CalledProcessError object will have the return code in the returncode + attribute and output in the output attribute. + + The arguments are the same as for the Popen constructor. Example: + + output = check_output(["ls", "-l", "/dev/null"]) + + +Exceptions +---------- +Exceptions raised in the child process, before the new program has +started to execute, will be re-raised in the parent. Additionally, +the exception object will have one extra attribute called +'child_traceback', which is a string containing traceback information +from the childs point of view. + +The most common exception raised is OSError. This occurs, for +example, when trying to execute a non-existent file. Applications +should prepare for OSErrors. + +A ValueError will be raised if Popen is called with invalid arguments. + +check_call() and check_output() will raise CalledProcessError, if the +called process returns a non-zero return code. + + +Security +-------- +Unlike some other popen functions, this implementation will never call +/bin/sh implicitly. This means that all characters, including shell +metacharacters, can safely be passed to child processes. + + +Popen objects +============= +Instances of the Popen class have the following methods: + +poll() + Check if child process has terminated. Returns returncode + attribute. + +wait() + Wait for child process to terminate. Returns returncode attribute. + +communicate(input=None) + Interact with process: Send data to stdin. Read data from stdout + and stderr, until end-of-file is reached. Wait for process to + terminate. The optional input argument should be a string to be + sent to the child process, or None, if no data should be sent to + the child. + + communicate() returns a tuple (stdout, stderr). + + Note: The data read is buffered in memory, so do not use this + method if the data size is large or unlimited. + +The following attributes are also available: + +stdin + If the stdin argument is PIPE, this attribute is a file object + that provides input to the child process. Otherwise, it is None. + +stdout + If the stdout argument is PIPE, this attribute is a file object + that provides output from the child process. Otherwise, it is + None. + +stderr + If the stderr argument is PIPE, this attribute is file object that + provides error output from the child process. Otherwise, it is + None. + +pid + The process ID of the child process. + +returncode + The child return code. A None value indicates that the process + hasn't terminated yet. A negative value -N indicates that the + child was terminated by signal N (UNIX only). 
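+
+A small end-to-end illustration of the above (an illustrative sketch; it
+assumes a POSIX system with 'cat' on the PATH):
+
+p = Popen(["cat"], stdin=PIPE, stdout=PIPE)
+out, err = p.communicate("hello")
+# out == "hello", err is None, and p.returncode == 0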
+ + +Replacing older functions with the subprocess module +==================================================== +In this section, "a ==> b" means that b can be used as a replacement +for a. + +Note: All functions in this section fail (more or less) silently if +the executed program cannot be found; this module raises an OSError +exception. + +In the following examples, we assume that the subprocess module is +imported with "from subprocess import *". + + +Replacing /bin/sh shell backquote +--------------------------------- +output=`mycmd myarg` +==> +output = Popen(["mycmd", "myarg"], stdout=PIPE).communicate()[0] + + +Replacing shell pipe line +------------------------- +output=`dmesg | grep hda` +==> +p1 = Popen(["dmesg"], stdout=PIPE) +p2 = Popen(["grep", "hda"], stdin=p1.stdout, stdout=PIPE) +output = p2.communicate()[0] + + +Replacing os.system() +--------------------- +sts = os.system("mycmd" + " myarg") +==> +p = Popen("mycmd" + " myarg", shell=True) +pid, sts = os.waitpid(p.pid, 0) + +Note: + +* Calling the program through the shell is usually not required. + +* It's easier to look at the returncode attribute than the + exitstatus. + +A more real-world example would look like this: + +try: + retcode = call("mycmd" + " myarg", shell=True) + if retcode < 0: + print >>sys.stderr, "Child was terminated by signal", -retcode + else: + print >>sys.stderr, "Child returned", retcode +except OSError, e: + print >>sys.stderr, "Execution failed:", e + + +Replacing os.spawn* +------------------- +P_NOWAIT example: + +pid = os.spawnlp(os.P_NOWAIT, "/bin/mycmd", "mycmd", "myarg") +==> +pid = Popen(["/bin/mycmd", "myarg"]).pid + + +P_WAIT example: + +retcode = os.spawnlp(os.P_WAIT, "/bin/mycmd", "mycmd", "myarg") +==> +retcode = call(["/bin/mycmd", "myarg"]) + + +Vector example: + +os.spawnvp(os.P_NOWAIT, path, args) +==> +Popen([path] + args[1:]) + + +Environment example: + +os.spawnlpe(os.P_NOWAIT, "/bin/mycmd", "mycmd", "myarg", env) +==> +Popen(["/bin/mycmd", "myarg"], env={"PATH": "/usr/bin"}) + + +Replacing os.popen* +------------------- +pipe = os.popen("cmd", mode='r', bufsize) +==> +pipe = Popen("cmd", shell=True, bufsize=bufsize, stdout=PIPE).stdout + +pipe = os.popen("cmd", mode='w', bufsize) +==> +pipe = Popen("cmd", shell=True, bufsize=bufsize, stdin=PIPE).stdin + + +(child_stdin, child_stdout) = os.popen2("cmd", mode, bufsize) +==> +p = Popen("cmd", shell=True, bufsize=bufsize, + stdin=PIPE, stdout=PIPE, close_fds=True) +(child_stdin, child_stdout) = (p.stdin, p.stdout) + + +(child_stdin, + child_stdout, + child_stderr) = os.popen3("cmd", mode, bufsize) +==> +p = Popen("cmd", shell=True, bufsize=bufsize, + stdin=PIPE, stdout=PIPE, stderr=PIPE, close_fds=True) +(child_stdin, + child_stdout, + child_stderr) = (p.stdin, p.stdout, p.stderr) + + +(child_stdin, child_stdout_and_stderr) = os.popen4("cmd", mode, + bufsize) +==> +p = Popen("cmd", shell=True, bufsize=bufsize, + stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True) +(child_stdin, child_stdout_and_stderr) = (p.stdin, p.stdout) + +On Unix, os.popen2, os.popen3 and os.popen4 also accept a sequence as +the command to execute, in which case arguments will be passed +directly to the program without shell intervention. 
This usage can be +replaced as follows: + +(child_stdin, child_stdout) = os.popen2(["/bin/ls", "-l"], mode, + bufsize) +==> +p = Popen(["/bin/ls", "-l"], bufsize=bufsize, stdin=PIPE, stdout=PIPE) +(child_stdin, child_stdout) = (p.stdin, p.stdout) + +Return code handling translates as follows: + +pipe = os.popen("cmd", 'w') +... +rc = pipe.close() +if rc is not None and rc % 256: + print "There were some errors" +==> +process = Popen("cmd", 'w', shell=True, stdin=PIPE) +... +process.stdin.close() +if process.wait() != 0: + print "There were some errors" + + +Replacing popen2.* +------------------ +(child_stdout, child_stdin) = popen2.popen2("somestring", bufsize, mode) +==> +p = Popen(["somestring"], shell=True, bufsize=bufsize + stdin=PIPE, stdout=PIPE, close_fds=True) +(child_stdout, child_stdin) = (p.stdout, p.stdin) + +On Unix, popen2 also accepts a sequence as the command to execute, in +which case arguments will be passed directly to the program without +shell intervention. This usage can be replaced as follows: + +(child_stdout, child_stdin) = popen2.popen2(["mycmd", "myarg"], bufsize, + mode) +==> +p = Popen(["mycmd", "myarg"], bufsize=bufsize, + stdin=PIPE, stdout=PIPE, close_fds=True) +(child_stdout, child_stdin) = (p.stdout, p.stdin) + +The popen2.Popen3 and popen2.Popen4 basically works as subprocess.Popen, +except that: + +* subprocess.Popen raises an exception if the execution fails +* the capturestderr argument is replaced with the stderr argument. +* stdin=PIPE and stdout=PIPE must be specified. +* popen2 closes all filedescriptors by default, but you have to specify + close_fds=True with subprocess.Popen. +""" + +import sys +mswindows = (sys.platform == "win32") + +import os +import types +import traceback +import gc +import signal +import errno + +try: + set() +except: + from kitchen.pycompat24.sets import set + +# Exception classes used by this module. +class CalledProcessError(Exception): + """This exception is raised when a process run by check_call() or + check_output() returns a non-zero exit status. + The exit status will be stored in the returncode attribute; + check_output() will also store the output in the output attribute. + """ + def __init__(self, returncode, cmd, output=None): + self.returncode = returncode + self.cmd = cmd + self.output = output + def __str__(self): + return "Command '%s' returned non-zero exit status %d" % (self.cmd, self.returncode) + + +if mswindows: + import threading + import msvcrt + import _subprocess + class STARTUPINFO: + dwFlags = 0 + hStdInput = None + hStdOutput = None + hStdError = None + wShowWindow = 0 + class pywintypes: + error = IOError +else: + import select + _has_poll = hasattr(select, 'poll') + import fcntl + import pickle + + # When select or poll has indicated that the file is writable, + # we can write up to _PIPE_BUF bytes without risk of blocking. + # POSIX defines PIPE_BUF as >= 512. 
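+    # (When select() or poll() reports the descriptor writable, at least
+    # this many bytes can be written without blocking, which is why the
+    # communicate() helpers below chunk their writes to _PIPE_BUF.)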
+ _PIPE_BUF = getattr(select, 'PIPE_BUF', 512) + + +__all__ = ["Popen", "PIPE", "STDOUT", "call", "check_call", + "check_output", "CalledProcessError"] + +if mswindows: + from _subprocess import CREATE_NEW_CONSOLE, CREATE_NEW_PROCESS_GROUP, \ + STD_INPUT_HANDLE, STD_OUTPUT_HANDLE, \ + STD_ERROR_HANDLE, SW_HIDE, \ + STARTF_USESTDHANDLES, STARTF_USESHOWWINDOW + + __all__.extend(["CREATE_NEW_CONSOLE", "CREATE_NEW_PROCESS_GROUP", + "STD_INPUT_HANDLE", "STD_OUTPUT_HANDLE", + "STD_ERROR_HANDLE", "SW_HIDE", + "STARTF_USESTDHANDLES", "STARTF_USESHOWWINDOW"]) +try: + MAXFD = os.sysconf("SC_OPEN_MAX") +except: + MAXFD = 256 + +_active = [] + +def _cleanup(): + for inst in _active[:]: + res = inst._internal_poll(_deadstate=sys.maxint) + if res is not None: + try: + _active.remove(inst) + except ValueError: + # This can happen if two threads create a new Popen instance. + # It's harmless that it was already removed, so ignore. + pass + +PIPE = -1 +STDOUT = -2 + + +def _eintr_retry_call(func, *args): + while True: + try: + return func(*args) + except (OSError, IOError), e: + if e.errno == errno.EINTR: + continue + raise + + +def call(*popenargs, **kwargs): + """Run command with arguments. Wait for command to complete, then + return the returncode attribute. + + The arguments are the same as for the Popen constructor. Example: + + retcode = call(["ls", "-l"]) + """ + return Popen(*popenargs, **kwargs).wait() + + +def check_call(*popenargs, **kwargs): + """Run command with arguments. Wait for command to complete. If + the exit code was zero then return, otherwise raise + CalledProcessError. The CalledProcessError object will have the + return code in the returncode attribute. + + The arguments are the same as for the Popen constructor. Example: + + check_call(["ls", "-l"]) + """ + retcode = call(*popenargs, **kwargs) + if retcode: + cmd = kwargs.get("args") + if cmd is None: + cmd = popenargs[0] + raise CalledProcessError(retcode, cmd) + return 0 + + +def check_output(*popenargs, **kwargs): + r"""Run command with arguments and return its output as a byte string. + + If the exit code was non-zero it raises a CalledProcessError. The + CalledProcessError object will have the return code in the returncode + attribute and output in the output attribute. + + The arguments are the same as for the Popen constructor. Example: + + >>> check_output(["ls", "-l", "/dev/null"]) + 'crw-rw-rw- 1 root root 1, 3 Oct 18 2007 /dev/null\n' + + The stdout argument is not allowed as it is used internally. + To capture standard error in the result, use stderr=STDOUT. + + >>> check_output(["/bin/sh", "-c", + ... "ls -l non_existent_file ; exit 0"], + ... stderr=STDOUT) + 'ls: non_existent_file: No such file or directory\n' + """ + if 'stdout' in kwargs: + raise ValueError('stdout argument not allowed, it will be overridden.') + process = Popen(stdout=PIPE, *popenargs, **kwargs) + output, unused_err = process.communicate() + retcode = process.poll() + if retcode: + cmd = kwargs.get("args") + if cmd is None: + cmd = popenargs[0] + raise CalledProcessError(retcode, cmd, output=output) + return output + + +def list2cmdline(seq): + """ + Translate a sequence of arguments into a command line + string, using the same rules as the MS C runtime: + + 1) Arguments are delimited by white space, which is either a + space or a tab. + + 2) A string surrounded by double quotation marks is + interpreted as a single argument, regardless of white space + contained within. A quoted string can be embedded in an + argument. 
+ + 3) A double quotation mark preceded by a backslash is + interpreted as a literal double quotation mark. + + 4) Backslashes are interpreted literally, unless they + immediately precede a double quotation mark. + + 5) If backslashes immediately precede a double quotation mark, + every pair of backslashes is interpreted as a literal + backslash. If the number of backslashes is odd, the last + backslash escapes the next double quotation mark as + described in rule 3. + """ + + # See + # http://msdn.microsoft.com/en-us/library/17w5ykft.aspx + # or search http://msdn.microsoft.com for + # "Parsing C++ Command-Line Arguments" + result = [] + needquote = False + for arg in seq: + bs_buf = [] + + # Add a space to separate this argument from the others + if result: + result.append(' ') + + needquote = (" " in arg) or ("\t" in arg) or not arg + if needquote: + result.append('"') + + for c in arg: + if c == '\\': + # Don't know if we need to double yet. + bs_buf.append(c) + elif c == '"': + # Double backslashes. + result.append('\\' * len(bs_buf)*2) + bs_buf = [] + result.append('\\"') + else: + # Normal char + if bs_buf: + result.extend(bs_buf) + bs_buf = [] + result.append(c) + + # Add remaining backslashes, if any. + if bs_buf: + result.extend(bs_buf) + + if needquote: + result.extend(bs_buf) + result.append('"') + + return ''.join(result) + + +class Popen(object): + def __init__(self, args, bufsize=0, executable=None, + stdin=None, stdout=None, stderr=None, + preexec_fn=None, close_fds=False, shell=False, + cwd=None, env=None, universal_newlines=False, + startupinfo=None, creationflags=0): + """Create new Popen instance.""" + _cleanup() + + self._child_created = False + if not isinstance(bufsize, (int, long)): + raise TypeError("bufsize must be an integer") + + if mswindows: + if preexec_fn is not None: + raise ValueError("preexec_fn is not supported on Windows " + "platforms") + if close_fds and (stdin is not None or stdout is not None or + stderr is not None): + raise ValueError("close_fds is not supported on Windows " + "platforms if you redirect stdin/stdout/stderr") + else: + # POSIX + if startupinfo is not None: + raise ValueError("startupinfo is only supported on Windows " + "platforms") + if creationflags != 0: + raise ValueError("creationflags is only supported on Windows " + "platforms") + + self.stdin = None + self.stdout = None + self.stderr = None + self.pid = None + self.returncode = None + self.universal_newlines = universal_newlines + + # Input and output objects. The general principle is like + # this: + # + # Parent Child + # ------ ----- + # p2cwrite ---stdin---> p2cread + # c2pread <--stdout--- c2pwrite + # errread <--stderr--- errwrite + # + # On POSIX, the child objects are file descriptors. On + # Windows, these are Windows file handles. The parent objects + # are file descriptors on both platforms. The parent objects + # are None when not using PIPEs. The child objects are None + # when not redirecting. 
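+        # For example, on POSIX a Popen(['cmd'], stdin=PIPE) call fills in
+        # only the p2cread/p2cwrite pair below; the other four stay None.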
+ + (p2cread, p2cwrite, + c2pread, c2pwrite, + errread, errwrite) = self._get_handles(stdin, stdout, stderr) + + self._execute_child(args, executable, preexec_fn, close_fds, + cwd, env, universal_newlines, + startupinfo, creationflags, shell, + p2cread, p2cwrite, + c2pread, c2pwrite, + errread, errwrite) + + if mswindows: + if p2cwrite is not None: + p2cwrite = msvcrt.open_osfhandle(p2cwrite.Detach(), 0) + if c2pread is not None: + c2pread = msvcrt.open_osfhandle(c2pread.Detach(), 0) + if errread is not None: + errread = msvcrt.open_osfhandle(errread.Detach(), 0) + + if p2cwrite is not None: + self.stdin = os.fdopen(p2cwrite, 'wb', bufsize) + if c2pread is not None: + if universal_newlines: + self.stdout = os.fdopen(c2pread, 'rU', bufsize) + else: + self.stdout = os.fdopen(c2pread, 'rb', bufsize) + if errread is not None: + if universal_newlines: + self.stderr = os.fdopen(errread, 'rU', bufsize) + else: + self.stderr = os.fdopen(errread, 'rb', bufsize) + + + def _translate_newlines(self, data): + data = data.replace("\r\n", "\n") + data = data.replace("\r", "\n") + return data + + + def __del__(self, _maxint=sys.maxint, _active=_active): + # If __init__ hasn't had a chance to execute (e.g. if it + # was passed an undeclared keyword argument), we don't + # have a _child_created attribute at all. + if not getattr(self, '_child_created', False): + # We didn't get to successfully create a child process. + return + # In case the child hasn't been waited on, check if it's done. + self._internal_poll(_deadstate=_maxint) + if self.returncode is None and _active is not None: + # Child is still running, keep us alive until we can wait on it. + _active.append(self) + + + def communicate(self, input=None): + """Interact with process: Send data to stdin. Read data from + stdout and stderr, until end-of-file is reached. Wait for + process to terminate. The optional input argument should be a + string to be sent to the child process, or None, if no data + should be sent to the child. + + communicate() returns a tuple (stdout, stderr).""" + + # Optimization: If we are only using one pipe, or no pipe at + # all, using select() or threads is unnecessary. 
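+        # (Two or more of the three being None means at most one pipe is
+        # live, so the sequential read/write below cannot deadlock.)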
+ if [self.stdin, self.stdout, self.stderr].count(None) >= 2: + stdout = None + stderr = None + if self.stdin: + if input: + try: + self.stdin.write(input) + except IOError, e: + if e.errno != errno.EPIPE and e.errno != errno.EINVAL: + raise + self.stdin.close() + elif self.stdout: + stdout = _eintr_retry_call(self.stdout.read) + self.stdout.close() + elif self.stderr: + stderr = _eintr_retry_call(self.stderr.read) + self.stderr.close() + self.wait() + return (stdout, stderr) + + return self._communicate(input) + + + def poll(self): + return self._internal_poll() + + + if mswindows: + # + # Windows methods + # + def _get_handles(self, stdin, stdout, stderr): + """Construct and return tuple with IO objects: + p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite + """ + if stdin is None and stdout is None and stderr is None: + return (None, None, None, None, None, None) + + p2cread, p2cwrite = None, None + c2pread, c2pwrite = None, None + errread, errwrite = None, None + + if stdin is None: + p2cread = _subprocess.GetStdHandle(_subprocess.STD_INPUT_HANDLE) + if p2cread is None: + p2cread, _ = _subprocess.CreatePipe(None, 0) + elif stdin == PIPE: + p2cread, p2cwrite = _subprocess.CreatePipe(None, 0) + elif isinstance(stdin, int): + p2cread = msvcrt.get_osfhandle(stdin) + else: + # Assuming file-like object + p2cread = msvcrt.get_osfhandle(stdin.fileno()) + p2cread = self._make_inheritable(p2cread) + + if stdout is None: + c2pwrite = _subprocess.GetStdHandle(_subprocess.STD_OUTPUT_HANDLE) + if c2pwrite is None: + _, c2pwrite = _subprocess.CreatePipe(None, 0) + elif stdout == PIPE: + c2pread, c2pwrite = _subprocess.CreatePipe(None, 0) + elif isinstance(stdout, int): + c2pwrite = msvcrt.get_osfhandle(stdout) + else: + # Assuming file-like object + c2pwrite = msvcrt.get_osfhandle(stdout.fileno()) + c2pwrite = self._make_inheritable(c2pwrite) + + if stderr is None: + errwrite = _subprocess.GetStdHandle(_subprocess.STD_ERROR_HANDLE) + if errwrite is None: + _, errwrite = _subprocess.CreatePipe(None, 0) + elif stderr == PIPE: + errread, errwrite = _subprocess.CreatePipe(None, 0) + elif stderr == STDOUT: + errwrite = c2pwrite + elif isinstance(stderr, int): + errwrite = msvcrt.get_osfhandle(stderr) + else: + # Assuming file-like object + errwrite = msvcrt.get_osfhandle(stderr.fileno()) + errwrite = self._make_inheritable(errwrite) + + return (p2cread, p2cwrite, + c2pread, c2pwrite, + errread, errwrite) + + + def _make_inheritable(self, handle): + """Return a duplicate of handle, which is inheritable""" + return _subprocess.DuplicateHandle(_subprocess.GetCurrentProcess(), + handle, _subprocess.GetCurrentProcess(), 0, 1, + _subprocess.DUPLICATE_SAME_ACCESS) + + + def _find_w9xpopen(self): + """Find and return absolut path to w9xpopen.exe""" + w9xpopen = os.path.join( + os.path.dirname(_subprocess.GetModuleFileName(0)), + "w9xpopen.exe") + if not os.path.exists(w9xpopen): + # Eeek - file-not-found - possibly an embedding + # situation - see if we can locate it in sys.exec_prefix + w9xpopen = os.path.join(os.path.dirname(sys.exec_prefix), + "w9xpopen.exe") + if not os.path.exists(w9xpopen): + raise RuntimeError("Cannot locate w9xpopen.exe, which is " + "needed for Popen to work with your " + "shell or platform.") + return w9xpopen + + + def _execute_child(self, args, executable, preexec_fn, close_fds, + cwd, env, universal_newlines, + startupinfo, creationflags, shell, + p2cread, p2cwrite, + c2pread, c2pwrite, + errread, errwrite): + """Execute program (MS Windows version)""" + + if not 
isinstance(args, types.StringTypes): + args = list2cmdline(args) + + # Process startup details + if startupinfo is None: + startupinfo = STARTUPINFO() + if None not in (p2cread, c2pwrite, errwrite): + startupinfo.dwFlags |= _subprocess.STARTF_USESTDHANDLES + startupinfo.hStdInput = p2cread + startupinfo.hStdOutput = c2pwrite + startupinfo.hStdError = errwrite + + if shell: + startupinfo.dwFlags |= _subprocess.STARTF_USESHOWWINDOW + startupinfo.wShowWindow = _subprocess.SW_HIDE + comspec = os.environ.get("COMSPEC", "cmd.exe") + args = "%s /c %s" % (comspec, args) + if (_subprocess.GetVersion() >= 0x80000000L or + os.path.basename(comspec).lower() == "command.com"): + # Win9x, or using command.com on NT. We need to + # use the w9xpopen intermediate program. For more + # information, see KB Q150956 + # (http://web.archive.org/web/20011105084002/http://support.microsoft.com/support/kb/articles/Q150/9/56.asp) + w9xpopen = self._find_w9xpopen() + args = '"%s" %s' % (w9xpopen, args) + # Not passing CREATE_NEW_CONSOLE has been known to + # cause random failures on win9x. Specifically a + # dialog: "Your program accessed mem currently in + # use at xxx" and a hopeful warning about the + # stability of your system. Cost is Ctrl+C wont + # kill children. + creationflags |= _subprocess.CREATE_NEW_CONSOLE + + # Start the process + try: + try: + hp, ht, pid, tid = _subprocess.CreateProcess(executable, args, + # no special security + None, None, + int(not close_fds), + creationflags, + env, + cwd, + startupinfo) + except pywintypes.error, e: + # Translate pywintypes.error to WindowsError, which is + # a subclass of OSError. FIXME: We should really + # translate errno using _sys_errlist (or similar), but + # how can this be done from Python? + raise WindowsError(*e.args) + finally: + # Child is launched. Close the parent's copy of those pipe + # handles that only the child should have open. You need + # to make sure that no handles to the write end of the + # output pipe are maintained in this process or else the + # pipe will not close when the child process exits and the + # ReadFile will hang. + if p2cread is not None: + p2cread.Close() + if c2pwrite is not None: + c2pwrite.Close() + if errwrite is not None: + errwrite.Close() + + # Retain the process handle, but close the thread handle + self._child_created = True + self._handle = hp + self.pid = pid + ht.Close() + + def _internal_poll(self, _deadstate=None, + _WaitForSingleObject=_subprocess.WaitForSingleObject, + _WAIT_OBJECT_0=_subprocess.WAIT_OBJECT_0, + _GetExitCodeProcess=_subprocess.GetExitCodeProcess): + """Check if child process has terminated. Returns returncode + attribute. + + This method is called by __del__, so it can only refer to objects + in its local scope. + + """ + if self.returncode is None: + if _WaitForSingleObject(self._handle, 0) == _WAIT_OBJECT_0: + self.returncode = _GetExitCodeProcess(self._handle) + return self.returncode + + + def wait(self): + """Wait for child process to terminate. 
Returns returncode + attribute.""" + if self.returncode is None: + _subprocess.WaitForSingleObject(self._handle, + _subprocess.INFINITE) + self.returncode = _subprocess.GetExitCodeProcess(self._handle) + return self.returncode + + + def _readerthread(self, fh, buffer): + buffer.append(fh.read()) + + + def _communicate(self, input): + stdout = None # Return + stderr = None # Return + + if self.stdout: + stdout = [] + stdout_thread = threading.Thread(target=self._readerthread, + args=(self.stdout, stdout)) + stdout_thread.setDaemon(True) + stdout_thread.start() + if self.stderr: + stderr = [] + stderr_thread = threading.Thread(target=self._readerthread, + args=(self.stderr, stderr)) + stderr_thread.setDaemon(True) + stderr_thread.start() + + if self.stdin: + if input is not None: + try: + self.stdin.write(input) + except IOError, e: + if e.errno != errno.EPIPE: + raise + self.stdin.close() + + if self.stdout: + stdout_thread.join() + if self.stderr: + stderr_thread.join() + + # All data exchanged. Translate lists into strings. + if stdout is not None: + stdout = stdout[0] + if stderr is not None: + stderr = stderr[0] + + # Translate newlines, if requested. We cannot let the file + # object do the translation: It is based on stdio, which is + # impossible to combine with select (unless forcing no + # buffering). + if self.universal_newlines and hasattr(file, 'newlines'): + if stdout: + stdout = self._translate_newlines(stdout) + if stderr: + stderr = self._translate_newlines(stderr) + + self.wait() + return (stdout, stderr) + + def send_signal(self, sig): + """Send a signal to the process + """ + if sig == signal.SIGTERM: + self.terminate() + elif sig == signal.CTRL_C_EVENT: + os.kill(self.pid, signal.CTRL_C_EVENT) + elif sig == signal.CTRL_BREAK_EVENT: + os.kill(self.pid, signal.CTRL_BREAK_EVENT) + else: + raise ValueError("Unsupported signal: %s" % (sig,)) + + def terminate(self): + """Terminates the process + """ + _subprocess.TerminateProcess(self._handle, 1) + + kill = terminate + + else: + # + # POSIX methods + # + def _get_handles(self, stdin, stdout, stderr): + """Construct and return tuple with IO objects: + p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite + """ + p2cread, p2cwrite = None, None + c2pread, c2pwrite = None, None + errread, errwrite = None, None + + if stdin is None: + pass + elif stdin == PIPE: + p2cread, p2cwrite = self.pipe_cloexec() + elif isinstance(stdin, int): + p2cread = stdin + else: + # Assuming file-like object + p2cread = stdin.fileno() + + if stdout is None: + pass + elif stdout == PIPE: + c2pread, c2pwrite = self.pipe_cloexec() + elif isinstance(stdout, int): + c2pwrite = stdout + else: + # Assuming file-like object + c2pwrite = stdout.fileno() + + if stderr is None: + pass + elif stderr == PIPE: + errread, errwrite = self.pipe_cloexec() + elif stderr == STDOUT: + errwrite = c2pwrite + elif isinstance(stderr, int): + errwrite = stderr + else: + # Assuming file-like object + errwrite = stderr.fileno() + + return (p2cread, p2cwrite, + c2pread, c2pwrite, + errread, errwrite) + + + def _set_cloexec_flag(self, fd, cloexec=True): + try: + cloexec_flag = fcntl.FD_CLOEXEC + except AttributeError: + cloexec_flag = 1 + + old = fcntl.fcntl(fd, fcntl.F_GETFD) + if cloexec: + fcntl.fcntl(fd, fcntl.F_SETFD, old | cloexec_flag) + else: + fcntl.fcntl(fd, fcntl.F_SETFD, old & ~cloexec_flag) + + + def pipe_cloexec(self): + """Create a pipe with FDs set CLOEXEC.""" + # Pipes' FDs are set CLOEXEC by default because we don't want them + # to be inherited by other 
subprocesses: the CLOEXEC flag is removed + # from the child's FDs by _dup2(), between fork() and exec(). + # This is not atomic: we would need the pipe2() syscall for that. + r, w = os.pipe() + self._set_cloexec_flag(r) + self._set_cloexec_flag(w) + return r, w + + + def _close_fds(self, but): + if hasattr(os, 'closerange'): + os.closerange(3, but) + os.closerange(but + 1, MAXFD) + else: + for i in xrange(3, MAXFD): + if i == but: + continue + try: + os.close(i) + except: + pass + + + def _execute_child(self, args, executable, preexec_fn, close_fds, + cwd, env, universal_newlines, + startupinfo, creationflags, shell, + p2cread, p2cwrite, + c2pread, c2pwrite, + errread, errwrite): + """Execute program (POSIX version)""" + + if isinstance(args, types.StringTypes): + args = [args] + else: + args = list(args) + + if shell: + args = ["/bin/sh", "-c"] + args + if executable: + args[0] = executable + + if executable is None: + executable = args[0] + + # For transferring possible exec failure from child to parent + # The first char specifies the exception type: 0 means + # OSError, 1 means some other error. + errpipe_read, errpipe_write = self.pipe_cloexec() + try: + try: + gc_was_enabled = gc.isenabled() + # Disable gc to avoid bug where gc -> file_dealloc -> + # write to stderr -> hang. http://bugs.python.org/issue1336 + gc.disable() + try: + self.pid = os.fork() + except: + if gc_was_enabled: + gc.enable() + raise + self._child_created = True + if self.pid == 0: + # Child + try: + # Close parent's pipe ends + if p2cwrite is not None: + os.close(p2cwrite) + if c2pread is not None: + os.close(c2pread) + if errread is not None: + os.close(errread) + os.close(errpipe_read) + + # When duping fds, if there arises a situation + # where one of the fds is either 0, 1 or 2, it + # is possible that it is overwritten (#12607). + if c2pwrite == 0: + c2pwrite = os.dup(c2pwrite) + if errwrite == 0 or errwrite == 1: + errwrite = os.dup(errwrite) + + # Dup fds for child + def _dup2(a, b): + # dup2() removes the CLOEXEC flag but + # we must do it ourselves if dup2() + # would be a no-op (issue #10806). + if a == b: + self._set_cloexec_flag(a, False) + elif a is not None: + os.dup2(a, b) + _dup2(p2cread, 0) + _dup2(c2pwrite, 1) + _dup2(errwrite, 2) + + # Close pipe fds. Make sure we don't close the + # same fd more than once, or standard fds. + closed = set( (None,) ) + for fd in [p2cread, c2pwrite, errwrite]: + if fd not in closed and fd > 2: + os.close(fd) + closed.add(fd) + + # Close all other fds, if asked for + if close_fds: + self._close_fds(but=errpipe_write) + + if cwd is not None: + os.chdir(cwd) + + if preexec_fn: + preexec_fn() + + if env is None: + os.execvp(executable, args) + else: + os.execvpe(executable, args, env) + + except: + exc_type, exc_value, tb = sys.exc_info() + # Save the traceback and attach it to the exception object + exc_lines = traceback.format_exception(exc_type, + exc_value, + tb) + exc_value.child_traceback = ''.join(exc_lines) + os.write(errpipe_write, pickle.dumps(exc_value)) + + # This exitcode won't be reported to applications, so it + # really doesn't matter what we return. 
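+                        # (The parent learns of the failure from the pickled
+                        # exception written to errpipe_write above.)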
+ os._exit(255) + + # Parent + if gc_was_enabled: + gc.enable() + finally: + # be sure the FD is closed no matter what + os.close(errpipe_write) + + if p2cread is not None and p2cwrite is not None: + os.close(p2cread) + if c2pwrite is not None and c2pread is not None: + os.close(c2pwrite) + if errwrite is not None and errread is not None: + os.close(errwrite) + + # Wait for exec to fail or succeed; possibly raising exception + # Exception limited to 1M + data = _eintr_retry_call(os.read, errpipe_read, 1048576) + finally: + # be sure the FD is closed no matter what + os.close(errpipe_read) + + if data != "": + try: + _eintr_retry_call(os.waitpid, self.pid, 0) + except OSError, e: + if e.errno != errno.ECHILD: + raise + child_exception = pickle.loads(data) + for fd in (p2cwrite, c2pread, errread): + if fd is not None: + os.close(fd) + raise child_exception + + + def _handle_exitstatus(self, sts, _WIFSIGNALED=os.WIFSIGNALED, + _WTERMSIG=os.WTERMSIG, _WIFEXITED=os.WIFEXITED, + _WEXITSTATUS=os.WEXITSTATUS): + # This method is called (indirectly) by __del__, so it cannot + # refer to anything outside of its local scope.""" + if _WIFSIGNALED(sts): + self.returncode = -_WTERMSIG(sts) + elif _WIFEXITED(sts): + self.returncode = _WEXITSTATUS(sts) + else: + # Should never happen + raise RuntimeError("Unknown child exit status!") + + + def _internal_poll(self, _deadstate=None, _waitpid=os.waitpid, + _WNOHANG=os.WNOHANG, _os_error=os.error): + """Check if child process has terminated. Returns returncode + attribute. + + This method is called by __del__, so it cannot reference anything + outside of the local scope (nor can any methods it calls). + + """ + if self.returncode is None: + try: + pid, sts = _waitpid(self.pid, _WNOHANG) + if pid == self.pid: + self._handle_exitstatus(sts) + except _os_error: + if _deadstate is not None: + self.returncode = _deadstate + return self.returncode + + + def wait(self): + """Wait for child process to terminate. Returns returncode + attribute.""" + if self.returncode is None: + try: + pid, sts = _eintr_retry_call(os.waitpid, self.pid, 0) + except OSError, e: + if e.errno != errno.ECHILD: + raise + # This happens if SIGCLD is set to be ignored or waiting + # for child processes has otherwise been disabled for our + # process. This child is dead, we can't get the status. + sts = 0 + self._handle_exitstatus(sts) + return self.returncode + + + def _communicate(self, input): + if self.stdin: + # Flush stdio buffer. This might block, if the user has + # been writing to .stdin in an uncontrolled fashion. + self.stdin.flush() + if not input: + self.stdin.close() + + if _has_poll: + stdout, stderr = self._communicate_with_poll(input) + else: + stdout, stderr = self._communicate_with_select(input) + + # All data exchanged. Translate lists into strings. + if stdout is not None: + stdout = ''.join(stdout) + if stderr is not None: + stderr = ''.join(stderr) + + # Translate newlines, if requested. We cannot let the file + # object do the translation: It is based on stdio, which is + # impossible to combine with select (unless forcing no + # buffering). 
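            # (Editor's note, assumed from context rather than shown in this
            # patch: the _translate_newlines() helper used just below is
            # defined earlier in this file and amounts to something like
            #
            #     def _translate_newlines(self, data):
            #         data = data.replace("\r\n", "\n")
            #         return data.replace("\r", "\n")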
+ if self.universal_newlines and hasattr(file, 'newlines'): + if stdout: + stdout = self._translate_newlines(stdout) + if stderr: + stderr = self._translate_newlines(stderr) + + self.wait() + return (stdout, stderr) + + + def _communicate_with_poll(self, input): + stdout = None # Return + stderr = None # Return + fd2file = {} + fd2output = {} + + poller = select.poll() + def register_and_append(file_obj, eventmask): + poller.register(file_obj.fileno(), eventmask) + fd2file[file_obj.fileno()] = file_obj + + def close_unregister_and_remove(fd): + poller.unregister(fd) + fd2file[fd].close() + fd2file.pop(fd) + + if self.stdin and input: + register_and_append(self.stdin, select.POLLOUT) + + select_POLLIN_POLLPRI = select.POLLIN | select.POLLPRI + if self.stdout: + register_and_append(self.stdout, select_POLLIN_POLLPRI) + fd2output[self.stdout.fileno()] = stdout = [] + if self.stderr: + register_and_append(self.stderr, select_POLLIN_POLLPRI) + fd2output[self.stderr.fileno()] = stderr = [] + + input_offset = 0 + while fd2file: + try: + ready = poller.poll() + except select.error, e: + if e.args[0] == errno.EINTR: + continue + raise + + for fd, mode in ready: + if mode & select.POLLOUT: + chunk = input[input_offset : input_offset + _PIPE_BUF] + try: + input_offset += os.write(fd, chunk) + except OSError, e: + if e.errno == errno.EPIPE: + close_unregister_and_remove(fd) + else: + raise + else: + if input_offset >= len(input): + close_unregister_and_remove(fd) + elif mode & select_POLLIN_POLLPRI: + data = os.read(fd, 4096) + if not data: + close_unregister_and_remove(fd) + fd2output[fd].append(data) + else: + # Ignore hang up or errors. + close_unregister_and_remove(fd) + + return (stdout, stderr) + + + def _communicate_with_select(self, input): + read_set = [] + write_set = [] + stdout = None # Return + stderr = None # Return + + if self.stdin and input: + write_set.append(self.stdin) + if self.stdout: + read_set.append(self.stdout) + stdout = [] + if self.stderr: + read_set.append(self.stderr) + stderr = [] + + input_offset = 0 + while read_set or write_set: + try: + rlist, wlist, xlist = select.select(read_set, write_set, []) + except select.error, e: + if e.args[0] == errno.EINTR: + continue + raise + + if self.stdin in wlist: + chunk = input[input_offset : input_offset + _PIPE_BUF] + try: + bytes_written = os.write(self.stdin.fileno(), chunk) + except OSError, e: + if e.errno == errno.EPIPE: + self.stdin.close() + write_set.remove(self.stdin) + else: + raise + else: + input_offset += bytes_written + if input_offset >= len(input): + self.stdin.close() + write_set.remove(self.stdin) + + if self.stdout in rlist: + data = os.read(self.stdout.fileno(), 1024) + if data == "": + self.stdout.close() + read_set.remove(self.stdout) + stdout.append(data) + + if self.stderr in rlist: + data = os.read(self.stderr.fileno(), 1024) + if data == "": + self.stderr.close() + read_set.remove(self.stderr) + stderr.append(data) + + return (stdout, stderr) + + + def send_signal(self, sig): + """Send a signal to the process + """ + os.kill(self.pid, sig) + + def terminate(self): + """Terminate the process with SIGTERM + """ + self.send_signal(signal.SIGTERM) + + def kill(self): + """Kill the process with SIGKILL + """ + self.send_signal(signal.SIGKILL) + + +def _demo_posix(): + # + # Example 1: Simple redirection: Get process list + # + plist = Popen(["ps"], stdout=PIPE).communicate()[0] + print "Process list:" + print plist + + # + # Example 2: Change uid before executing child + # + if os.getuid() == 0: + p = 
Popen(["id"], preexec_fn=lambda: os.setuid(100)) + p.wait() + + # + # Example 3: Connecting several subprocesses + # + print "Looking for 'hda'..." + p1 = Popen(["dmesg"], stdout=PIPE) + p2 = Popen(["grep", "hda"], stdin=p1.stdout, stdout=PIPE) + print repr(p2.communicate()[0]) + + # + # Example 4: Catch execution error + # + print + print "Trying a weird file..." + try: + print Popen(["/this/path/does/not/exist"]).communicate() + except OSError, e: + if e.errno == errno.ENOENT: + print "The file didn't exist. I thought so..." + print "Child traceback:" + print e.child_traceback + else: + print "Error", e.errno + else: + print >>sys.stderr, "Gosh. No error." + + +def _demo_windows(): + # + # Example 1: Connecting several subprocesses + # + print "Looking for 'PROMPT' in set output..." + p1 = Popen("set", stdout=PIPE, shell=True) + p2 = Popen('find "PROMPT"', stdin=p1.stdout, stdout=PIPE) + print repr(p2.communicate()[0]) + + # + # Example 2: Simple execution of program + # + print "Executing calc..." + p = Popen("calc") + p.wait() + + +if __name__ == "__main__": + if mswindows: + _demo_windows() + else: + _demo_posix() diff --git a/kitchen/release.py b/kitchen/release.py new file mode 100644 index 0000000..0a30789 --- /dev/null +++ b/kitchen/release.py @@ -0,0 +1,35 @@ +''' +Information about this kitchen release. +''' + +from kitchen import _, __version__ + +NAME = 'kitchen' +VERSION = __version__ +DESCRIPTION = _('Kitchen contains a cornucopia of useful code') +LONG_DESCRIPTION = _(''' +We've all done it. In the process of writing a brand new application we've +discovered that we need a little bit of code that we've invented before. +Perhaps it's something to handle unicode text. Perhaps it's something to make +a bit of python-2.5 code run on python-2.3. Whatever it is, it ends up being +a tiny bit of code that seems too small to worry about pushing into its own +module so it sits there, a part of your current project, waiting to be cut and +pasted into your next project. And the next. And the next. And since that +little bittybit of code proved so useful to you, it's highly likely that it +proved useful to someone else as well. Useful enough that they've written it +and copy and pasted it over and over into each of their new projects. + +Well, no longer! Kitchen aims to pull these small snippets of code into a few +python modules which you can import and use within your project. No more copy +and paste! Now you can let someone else maintain and release these small +snippets so that you can get on with your life. +''') +AUTHOR = 'Toshio Kuratomi, Seth Vidal, others' +EMAIL = 'toshio@fedoraproject.org' +COPYRIGHT = '2011 Red Hat, Inc. and others' +URL = 'https://fedorahosted.org/kitchen' +DOWNLOAD_URL = 'https://fedorahosted.org/releases/k/i/kitchen' +LICENSE = 'LGPLv2+' + +__all__ = ('NAME', 'VERSION', 'DESCRIPTION', 'LONG_DESCRIPTION', 'AUTHOR', + 'EMAIL', 'COPYRIGHT', 'URL', 'DOWNLOAD_URL', 'LICENSE') diff --git a/kitchen/text/__init__.py b/kitchen/text/__init__.py new file mode 100644 index 0000000..d72c034 --- /dev/null +++ b/kitchen/text/__init__.py @@ -0,0 +1,17 @@ +''' +------------ +Kitchen.text +------------ + +Kitchen.text contains functions for manipulating text in python. + +This includes things like converting between byte strings and unicode, +and displaying text on the screen. 
+'''
+
+from kitchen.versioning import version_tuple_to_string
+
+__version_info__ = ((2, 1, 1),)
+__version__ = version_tuple_to_string(__version_info__)
+
+__all__ = ('converters', 'exceptions', 'misc',)
diff --git a/kitchen/text/converters.py b/kitchen/text/converters.py
new file mode 100644
index 0000000..8b5aac6
--- /dev/null
+++ b/kitchen/text/converters.py
@@ -0,0 +1,921 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2011 Red Hat, Inc.
+#
+# kitchen is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# kitchen is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with kitchen; if not, see <http://www.gnu.org/licenses/>
+#
+# Authors:
+#   Toshio Kuratomi
+#   Seth Vidal
+#
+# Portions of code taken from yum/i18n.py and
+# python-fedora: fedora/textutils.py
+
+'''
+Functions to handle conversion of byte :class:`str` and :class:`unicode`
+strings.
+
+.. versionchanged:: kitchen 0.2a2 ; API kitchen.text 2.0.0
+    Added :func:`~kitchen.text.converters.getwriter`
+
+.. versionchanged:: kitchen 0.2.2 ; API kitchen.text 2.1.0
+    Added :func:`~kitchen.text.converters.exception_to_unicode`,
+    :func:`~kitchen.text.converters.exception_to_bytes`,
+    :data:`~kitchen.text.converters.EXCEPTION_CONVERTERS`,
+    and :data:`~kitchen.text.converters.BYTE_EXCEPTION_CONVERTERS`
+
+.. versionchanged:: kitchen 1.0.1 ; API kitchen.text 2.1.1
+    Deprecated :data:`~kitchen.text.converters.BYTE_EXCEPTION_CONVERTERS` as
+    we've simplified :func:`~kitchen.text.converters.exception_to_unicode` and
+    :func:`~kitchen.text.converters.exception_to_bytes` to make it unnecessary
+
+'''
+try:
+    from base64 import b64encode, b64decode
+except ImportError:
+    from kitchen.pycompat24.base64 import b64encode, b64decode
+
+import codecs
+import warnings
+import xml.sax.saxutils
+
+# We need to access b_() for localizing our strings but we'll end up with
+# a circular import if we import it directly.
+import kitchen as k
+from kitchen.pycompat24 import sets
+sets.add_builtin_set()
+
+from kitchen.text.exceptions import ControlCharError, XmlEncodeError
+from kitchen.text.misc import guess_encoding, html_entities_unescape, \
+        process_control_chars
+
+#: Aliases for the utf-8 codec
+_UTF8_ALIASES = frozenset(('utf-8', 'UTF-8', 'utf8', 'UTF8', 'utf_8', 'UTF_8',
+    'utf', 'UTF', 'u8', 'U8'))
+#: Aliases for the latin-1 codec
+_LATIN1_ALIASES = frozenset(('latin-1', 'LATIN-1', 'latin1', 'LATIN1',
+    'latin', 'LATIN', 'l1', 'L1', 'cp819', 'CP819', '8859', 'iso8859-1',
+    'ISO8859-1', 'iso-8859-1', 'ISO-8859-1'))
+
+# EXCEPTION_CONVERTERS is defined below due to using to_unicode
+
+def to_unicode(obj, encoding='utf-8', errors='replace', nonstring=None,
+        non_string=None):
+    '''Convert an object into a :class:`unicode` string
+
+    :arg obj: Object to convert to a :class:`unicode` string. This should
+        normally be a byte :class:`str`
+    :kwarg encoding: What encoding to try converting the byte :class:`str` as.
+        Defaults to :term:`utf-8`
+    :kwarg errors: If errors are found while decoding, perform this action.
+ Defaults to ``replace`` which replaces the invalid bytes with + a character that means the bytes were unable to be decoded. Other + values are the same as the error handling schemes in the `codec base + classes + `_. + For instance ``strict`` which raises an exception and ``ignore`` which + simply omits the non-decodable characters. + :kwarg nonstring: How to treat nonstring values. Possible values are: + + :simplerepr: Attempt to call the object's "simple representation" + method and return that value. Python-2.3+ has two methods that + try to return a simple representation: :meth:`object.__unicode__` + and :meth:`object.__str__`. We first try to get a usable value + from :meth:`object.__unicode__`. If that fails we try the same + with :meth:`object.__str__`. + :empty: Return an empty :class:`unicode` string + :strict: Raise a :exc:`TypeError` + :passthru: Return the object unchanged + :repr: Attempt to return a :class:`unicode` string of the repr of the + object + + Default is ``simplerepr`` + + :kwarg non_string: *Deprecated* Use :attr:`nonstring` instead + :raises TypeError: if :attr:`nonstring` is ``strict`` and + a non-:class:`basestring` object is passed in or if :attr:`nonstring` + is set to an unknown value + :raises UnicodeDecodeError: if :attr:`errors` is ``strict`` and + :attr:`obj` is not decodable using the given encoding + :returns: :class:`unicode` string or the original object depending on the + value of :attr:`nonstring`. + + Usually this should be used on a byte :class:`str` but it can take both + byte :class:`str` and :class:`unicode` strings intelligently. Nonstring + objects are handled in different ways depending on the setting of the + :attr:`nonstring` parameter. + + The default values of this function are set so as to always return + a :class:`unicode` string and never raise an error when converting from + a byte :class:`str` to a :class:`unicode` string. However, when you do + not pass validly encoded text (or a nonstring object), you may end up with + output that you don't expect. Be sure you understand the requirements of + your data, not just ignore errors by passing it through this function. + + .. versionchanged:: 0.2.1a2 + Deprecated :attr:`non_string` in favor of :attr:`nonstring` parameter and changed + default value to ``simplerepr`` + ''' + if isinstance(obj, basestring): + if isinstance(obj, unicode): + return obj + if encoding in _UTF8_ALIASES: + return unicode(obj, 'utf-8', errors) + if encoding in _LATIN1_ALIASES: + return unicode(obj, 'latin-1', errors) + return obj.decode(encoding, errors) + + if non_string: + warnings.warn(k.b_('non_string is a deprecated parameter of' + ' to_unicode(). 
Use nonstring instead'), DeprecationWarning, + stacklevel=2) + if not nonstring: + nonstring = non_string + + if not nonstring: + nonstring = 'simplerepr' + if nonstring == 'empty': + return u'' + elif nonstring == 'passthru': + return obj + elif nonstring == 'simplerepr': + try: + simple = obj.__unicode__() + except (AttributeError, UnicodeError): + simple = None + if not simple: + try: + simple = str(obj) + except UnicodeError: + try: + simple = obj.__str__() + except (UnicodeError, AttributeError): + simple = u'' + if not isinstance(simple, unicode): + return unicode(simple, encoding, errors) + return simple + elif nonstring in ('repr', 'strict'): + obj_repr = repr(obj) + if not isinstance(obj_repr, unicode): + obj_repr = unicode(obj_repr, encoding, errors) + if nonstring == 'repr': + return obj_repr + raise TypeError(k.b_('to_unicode was given "%(obj)s" which is neither' + ' a byte string (str) or a unicode string') % + {'obj': obj_repr.encode(encoding, 'replace')}) + + raise TypeError(k.b_('nonstring value, %(param)s, is not set to a valid' + ' action') % {'param': nonstring}) + +def to_bytes(obj, encoding='utf-8', errors='replace', nonstring=None, + non_string=None): + '''Convert an object into a byte :class:`str` + + :arg obj: Object to convert to a byte :class:`str`. This should normally + be a :class:`unicode` string. + :kwarg encoding: Encoding to use to convert the :class:`unicode` string + into a byte :class:`str`. Defaults to :term:`utf-8`. + :kwarg errors: If errors are found while encoding, perform this action. + Defaults to ``replace`` which replaces the invalid bytes with + a character that means the bytes were unable to be encoded. Other + values are the same as the error handling schemes in the `codec base + classes + `_. + For instance ``strict`` which raises an exception and ``ignore`` which + simply omits the non-encodable characters. + :kwarg nonstring: How to treat nonstring values. Possible values are: + + :simplerepr: Attempt to call the object's "simple representation" + method and return that value. Python-2.3+ has two methods that + try to return a simple representation: :meth:`object.__unicode__` + and :meth:`object.__str__`. We first try to get a usable value + from :meth:`object.__str__`. If that fails we try the same + with :meth:`object.__unicode__`. + :empty: Return an empty byte :class:`str` + :strict: Raise a :exc:`TypeError` + :passthru: Return the object unchanged + :repr: Attempt to return a byte :class:`str` of the :func:`repr` of the + object + + Default is ``simplerepr``. + + :kwarg non_string: *Deprecated* Use :attr:`nonstring` instead. + :raises TypeError: if :attr:`nonstring` is ``strict`` and + a non-:class:`basestring` object is passed in or if :attr:`nonstring` + is set to an unknown value. + :raises UnicodeEncodeError: if :attr:`errors` is ``strict`` and all of the + bytes of :attr:`obj` are unable to be encoded using :attr:`encoding`. + :returns: byte :class:`str` or the original object depending on the value + of :attr:`nonstring`. + + .. warning:: + + If you pass a byte :class:`str` into this function the byte + :class:`str` is returned unmodified. It is **not** re-encoded with + the specified :attr:`encoding`. The easiest way to achieve that is:: + + to_bytes(to_unicode(text), encoding='utf-8') + + The initial :func:`to_unicode` call will ensure text is + a :class:`unicode` string. Then, :func:`to_bytes` will turn that into + a byte :class:`str` with the specified encoding. 
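For instance, a minimal sketch of that re-encoding recipe turning
``latin-1`` bytes into :term:`utf-8` (the sample bytes are an editor's
assumption, not from the original docstring)::

    >>> latin1_bytes = 'caf\xe9'
    >>> to_bytes(to_unicode(latin1_bytes, encoding='latin-1'), encoding='utf-8')
    'caf\xc3\xa9'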
+
+    Usually, this should be used on a :class:`unicode` string but it can take
+    either a byte :class:`str` or a :class:`unicode` string intelligently.
+    Nonstring objects are handled in different ways depending on the setting
+    of the :attr:`nonstring` parameter.
+
+    The default values of this function are set so as to always return a byte
+    :class:`str` and never raise an error when converting from unicode to
+    bytes. However, when you do not pass an encoding that can validly encode
+    the object (or a non-string object), you may end up with output that you
+    don't expect. Be sure you understand the requirements of your data, not
+    just ignore errors by passing it through this function.
+
+    .. versionchanged:: 0.2.1a2
+        Deprecated :attr:`non_string` in favor of :attr:`nonstring` parameter
+        and changed default value to ``simplerepr``
+    '''
+    if isinstance(obj, basestring):
+        if isinstance(obj, str):
+            return obj
+        return obj.encode(encoding, errors)
+    if non_string:
+        warnings.warn(k.b_('non_string is a deprecated parameter of'
+            ' to_bytes(). Use nonstring instead'), DeprecationWarning,
+            stacklevel=2)
+        if not nonstring:
+            nonstring = non_string
+    if not nonstring:
+        nonstring = 'simplerepr'
+
+    if nonstring == 'empty':
+        return ''
+    elif nonstring == 'passthru':
+        return obj
+    elif nonstring == 'simplerepr':
+        try:
+            simple = str(obj)
+        except UnicodeError:
+            try:
+                simple = obj.__str__()
+            except (AttributeError, UnicodeError):
+                simple = None
+        if not simple:
+            try:
+                simple = obj.__unicode__()
+            except (AttributeError, UnicodeError):
+                simple = ''
+        if isinstance(simple, unicode):
+            simple = simple.encode(encoding, 'replace')
+        return simple
+    elif nonstring in ('repr', 'strict'):
+        try:
+            obj_repr = obj.__repr__()
+        except (AttributeError, UnicodeError):
+            obj_repr = ''
+        if isinstance(obj_repr, unicode):
+            obj_repr = obj_repr.encode(encoding, errors)
+        else:
+            obj_repr = str(obj_repr)
+        if nonstring == 'repr':
+            return obj_repr
+        raise TypeError(k.b_('to_bytes was given "%(obj)s" which is neither'
+            ' a unicode string nor a byte string (str)') % {'obj': obj_repr})
+
+    raise TypeError(k.b_('nonstring value, %(param)s, is not set to a valid'
+        ' action') % {'param': nonstring})
+
+def getwriter(encoding):
+    '''Return a :class:`codecs.StreamWriter` that resists tracing back.
+
+    :arg encoding: Encoding to use for transforming :class:`unicode` strings
+        into byte :class:`str`.
+    :rtype: :class:`codecs.StreamWriter`
+    :returns: :class:`~codecs.StreamWriter` that you can instantiate to wrap
+        output streams to automatically translate :class:`unicode` strings
+        into :attr:`encoding`.
+
+    This is a reimplementation of :func:`codecs.getwriter` that returns
+    a :class:`~codecs.StreamWriter` that resists issuing tracebacks. The
+    :class:`~codecs.StreamWriter` that is returned uses
+    :func:`kitchen.text.converters.to_bytes` to convert :class:`unicode`
+    strings into byte :class:`str`. The departures from
+    :func:`codecs.getwriter` are:
+
+    1) The :class:`~codecs.StreamWriter` that is returned will take byte
+       :class:`str` as well as :class:`unicode` strings. Any byte
+       :class:`str` will be passed through unmodified.
+    2) The default error handler for unknown bytes is to ``replace`` the bytes
+       with the unknown character (``?`` in most ascii-based encodings, ``�``
+       in the utf encodings) whereas :func:`codecs.getwriter` defaults to
+       ``strict``.
+       Like :class:`codecs.StreamWriter`, the returned
+       :class:`~codecs.StreamWriter` can have its error handler changed in
+       code by setting ``stream.errors = 'new_handler_name'``
+
+    Example usage::
+
+        $ LC_ALL=C python
+        >>> import sys
+        >>> from kitchen.text.converters import getwriter
+        >>> UTF8Writer = getwriter('utf-8')
+        >>> unwrapped_stdout = sys.stdout
+        >>> sys.stdout = UTF8Writer(unwrapped_stdout)
+        >>> print 'caf\\xc3\\xa9'
+        café
+        >>> print u'caf\\xe9'
+        café
+        >>> ASCIIWriter = getwriter('ascii')
+        >>> sys.stdout = ASCIIWriter(unwrapped_stdout)
+        >>> print 'caf\\xc3\\xa9'
+        café
+        >>> print u'caf\\xe9'
+        caf?
+
+    .. seealso::
+
+        API docs for :class:`codecs.StreamWriter` and :func:`codecs.getwriter`
+        and `Print Fails <http://wiki.python.org/moin/PrintFails>`_ on the
+        python wiki.
+
+    .. versionadded:: kitchen 0.2a2, API: kitchen.text 1.1.0
+    '''
+    class _StreamWriter(codecs.StreamWriter):
+        # :W0223: We don't need to implement all methods of StreamWriter.
+        #   This is not the actual class that gets used but a replacement for
+        #   the actual class.
+        # :C0111: We're implementing an API from the stdlib. Just point
+        #   people at that documentation instead of writing docstrings here.
+        #pylint:disable-msg=W0223,C0111
+        def __init__(self, stream, errors='replace'):
+            codecs.StreamWriter.__init__(self, stream, errors)
+
+        def encode(self, msg, errors='replace'):
+            return (to_bytes(msg, encoding=self.encoding, errors=errors),
+                    len(msg))
+
+    _StreamWriter.encoding = encoding
+    return _StreamWriter
+
+def to_utf8(obj, errors='replace', non_string='passthru'):
+    '''*Deprecated*
+
+    Convert :class:`unicode` to an encoded :term:`utf-8` byte :class:`str`.
+    You should be using :func:`to_bytes` instead::
+
+        to_bytes(obj, encoding='utf-8', non_string='passthru')
+    '''
+    warnings.warn(k.b_('kitchen.text.converters.to_utf8 is deprecated. Use'
+        ' kitchen.text.converters.to_bytes(obj, encoding="utf-8",'
+        ' nonstring="passthru") instead.'), DeprecationWarning, stacklevel=2)
+    return to_bytes(obj, encoding='utf-8', errors=errors,
+        nonstring=non_string)
+
+### str is also the type name for byte strings so it's not a good name for
+### something that can return unicode strings
+def to_str(obj):
+    '''*Deprecated*
+
+    This function converts something to a byte :class:`str` if it isn't one.
+    It's used to call :func:`str` or :func:`unicode` on the object to get its
+    simple representation without danger of getting a :exc:`UnicodeError`.
+    You should be using :func:`to_unicode` or :func:`to_bytes` explicitly
+    instead.
+
+    If you need :class:`unicode` strings::
+
+        to_unicode(obj, nonstring='simplerepr')
+
+    If you need byte :class:`str`::
+
+        to_bytes(obj, nonstring='simplerepr')
+    '''
+    warnings.warn(k.b_('to_str is deprecated. Use to_unicode or to_bytes'
+        ' instead. See the to_str docstring for'
+        ' porting information.'),
+        DeprecationWarning, stacklevel=2)
+    return to_bytes(obj, nonstring='simplerepr')
+
+# Exception message extraction functions
+EXCEPTION_CONVERTERS = (lambda e: e.args[0], lambda e: e)
+''' Tuple of functions to try to use to convert an exception into a string
+    representation. Its main use is to extract a string (:class:`unicode` or
+    :class:`str`) from an exception object in :func:`exception_to_unicode` and
+    :func:`exception_to_bytes`. The functions here will try the exception's
+    ``args[0]`` and the exception itself (roughly equivalent to
+    `str(exception)`) to extract the message. This is only a default and can
+    be easily overridden when calling those functions.
+    There are several
+    reasons you might wish to do that. If you have exceptions where the best
+    string representing the exception is not returned by the default
+    functions, you can add another function to extract from a different
+    field::
+
+        from kitchen.text.converters import (EXCEPTION_CONVERTERS,
+            exception_to_unicode)
+
+        class MyError(Exception):
+            def __init__(self, message):
+                self.value = message
+
+        c = [lambda e: e.value]
+        c.extend(EXCEPTION_CONVERTERS)
+        try:
+            raise MyError('An Exception message')
+        except MyError, e:
+            print exception_to_unicode(e, converters=c)
+
+    Another reason would be if you're converting to a byte :class:`str` and
+    you know the :class:`str` needs to be a non-:term:`utf-8` encoding.
+    :func:`exception_to_bytes` defaults to :term:`utf-8` but if you convert
+    into a byte :class:`str` explicitly using a converter then you can choose
+    a different encoding::
+
+        from kitchen.text.converters import (EXCEPTION_CONVERTERS,
+            exception_to_bytes, to_bytes)
+        c = [lambda e: to_bytes(e.args[0], encoding='euc_jp'),
+            lambda e: to_bytes(e, encoding='euc_jp')]
+        c.extend(EXCEPTION_CONVERTERS)
+        try:
+            do_something()
+        except Exception, e:
+            log = open('logfile.euc_jp', 'a')
+            log.write('%s\n' % exception_to_bytes(e, converters=c))
+            log.close()
+
+    Each function in this list should take the exception as its sole argument
+    and return a string containing the message representing the exception.
+    The functions may return the message as a byte :class:`str`,
+    a :class:`unicode` string, or even an object if you trust the object to
+    return a decent string representation. The :func:`exception_to_unicode`
+    and :func:`exception_to_bytes` functions will make sure to convert the
+    string to the proper type before returning.
+
+    .. versionadded:: 0.2.2
+'''
+
+BYTE_EXCEPTION_CONVERTERS = (lambda e: to_bytes(e.args[0]), to_bytes)
+'''*Deprecated*: Use :data:`EXCEPTION_CONVERTERS` instead.
+
+    Tuple of functions to try to use to convert an exception into a string
+    representation. This tuple is similar to the one in
+    :data:`EXCEPTION_CONVERTERS` but it's used with :func:`exception_to_bytes`
+    instead. Ideally, these functions should do their best to return the data
+    as a byte :class:`str` but the results will be run through
+    :func:`to_bytes` before being returned.
+
+    .. versionadded:: 0.2.2
+    .. versionchanged:: 1.0.1
+        Deprecated as simplifications allow :data:`EXCEPTION_CONVERTERS` to
+        perform the same function.
+'''
+
+def exception_to_unicode(exc, converters=EXCEPTION_CONVERTERS):
+    '''Convert an exception object into a unicode representation
+
+    :arg exc: Exception object to convert
+    :kwarg converters: List of functions to use to convert the exception into
+        a string. See :data:`EXCEPTION_CONVERTERS` for the default value and
+        an example of adding other converters to the defaults. The functions
+        in the list are tried one at a time to see if they can extract
+        a string from the exception. The first one to do so without raising
+        an exception is used.
+    :returns: :class:`unicode` string representation of the exception. The
+        value extracted by the :attr:`converters` will be converted into
+        :class:`unicode` before being returned using the :term:`utf-8`
+        encoding. If you know you need to use an alternate encoding add
+        a function that does that to the list of functions in
+        :attr:`converters`.
+
+    ..
versionadded:: 0.2.2 + ''' + msg = u'' + for func in converters: + try: + msg = func(exc) + except: + pass + else: + break + return to_unicode(msg) + +def exception_to_bytes(exc, converters=EXCEPTION_CONVERTERS): + '''Convert an exception object into a str representation + + :arg exc: Exception object to convert + :kwarg converters: List of functions to use to convert the exception into + a string. See :data:`EXCEPTION_CONVERTERS` for the default value and + an example of adding other converters to the defaults. The functions + in the list are tried one at a time to see if they can extract + a string from the exception. The first one to do so without raising + an exception is used. + :returns: byte :class:`str` representation of the exception. The value + extracted by the :attr:`converters` will be converted into + :class:`str` before being returned using the :term:`utf-8` encoding. + If you know you need to use an alternate encoding add a function that + does that to the list of functions in :attr:`converters`) + + .. versionadded:: 0.2.2 + .. versionchanged:: 1.0.1 + Code simplification allowed us to switch to using + :data:`EXCEPTION_CONVERTERS` as the default value of + :attr:`converters`. + ''' + msg = '' + for func in converters: + try: + msg = func(exc) + except: + pass + else: + break + return to_bytes(msg) + +# +# XML Related Functions +# + +def unicode_to_xml(string, encoding='utf-8', attrib=False, + control_chars='replace'): + '''Take a :class:`unicode` string and turn it into a byte :class:`str` + suitable for xml + + :arg string: :class:`unicode` string to encode into an XML compatible byte + :class:`str` + :kwarg encoding: encoding to use for the returned byte :class:`str`. + Default is to encode to :term:`UTF-8`. If some of the characters in + :attr:`string` are not encodable in this encoding, the unknown + characters will be entered into the output string using xml character + references. + :kwarg attrib: If :data:`True`, quote the string for use in an xml + attribute. If :data:`False` (default), quote for use in an xml text + field. + :kwarg control_chars: :term:`control characters` are not allowed in XML + documents. When we encounter those we need to know what to do. Valid + options are: + + :replace: (default) Replace the control characters with ``?`` + :ignore: Remove the characters altogether from the output + :strict: Raise an :exc:`~kitchen.text.exceptions.XmlEncodeError` when + we encounter a :term:`control character` + + :raises kitchen.text.exceptions.XmlEncodeError: If :attr:`control_chars` + is set to ``strict`` and the string to be made suitable for output to + xml contains :term:`control characters` or if :attr:`string` is not + a :class:`unicode` string then we raise this exception. + :raises ValueError: If :attr:`control_chars` is set to something other than + ``replace``, ``ignore``, or ``strict``. + :rtype: byte :class:`str` + :returns: representation of the :class:`unicode` string as a valid XML + byte :class:`str` + + XML files consist mainly of text encoded using a particular charset. XML + also denies the use of certain bytes in the encoded text (example: ``ASCII + Null``). There are also special characters that must be escaped if they + are present in the input (example: ``<``). This function takes care of + all of those issues for you. + + There are a few different ways to use this function depending on your + needs. 
The simplest invocation is like this::
+
+        unicode_to_xml(u'String with non-ASCII characters: <"á と">')
+
+    This will return the following to you, encoded in :term:`utf-8`::
+
+        'String with non-ASCII characters: &lt;"á と"&gt;'
+
+    Pretty straightforward. Now, what if you need to encode your document in
+    something other than :term:`utf-8`? For instance, ``latin-1``? Let's
+    see::
+
+        unicode_to_xml(u'String with non-ASCII characters: <"á と">', encoding='latin-1')
+        'String with non-ASCII characters: &lt;"á &#12392;"&gt;'
+
+    Because the ``と`` character is not available in the ``latin-1`` charset,
+    it is replaced with ``&#12392;`` in our output. This is an xml character
+    reference which represents the character at unicode codepoint ``12392``,
+    the ``と`` character.
+
+    When you want to reverse this, use :func:`xml_to_unicode` which will turn
+    a byte :class:`str` into a :class:`unicode` string and replace the xml
+    character references with the unicode characters.
+
+    XML also has the quirk of not allowing :term:`control characters` in its
+    output. The :attr:`control_chars` parameter allows us to specify what to
+    do with those. For use cases that don't need absolute character by
+    character fidelity (example: holding strings that will just be used for
+    display in a GUI app later), the default value of ``replace`` works well::
+
+        unicode_to_xml(u'String with disallowed control chars: \u0000\u0007')
+        'String with disallowed control chars: ??'
+
+    If you do need to be able to reproduce all of the characters at a later
+    date (examples: if the string is a key value in a database or a path on a
+    filesystem) you have many choices. Here are a few that rely on ``utf-7``,
+    a verbose encoding that encodes :term:`control characters` (as well as
+    non-:term:`ASCII` unicode values) to characters from within the
+    :term:`ASCII` printable characters. The good thing about doing this is
+    that the code is pretty simple. You just need to use ``utf-7`` both when
+    encoding the field for xml and when decoding it for use in your python
+    program::
+
+        unicode_to_xml(u'String with unicode: と and control char: \u0007', encoding='utf7')
+        'String with unicode: +MGg and control char: +AAc-'
+        # [...]
+        xml_to_unicode('String with unicode: +MGg and control char: +AAc-', encoding='utf7')
+        u'String with unicode: と and control char: \u0007'
+
+    As you can see, the ``utf-7`` encoding will transform even characters that
+    would be representable in :term:`utf-8`. This can be a drawback if you
+    want unicode characters in the file to be readable without being decoded
+    first. You can work around this with increased complexity in your
+    application code::
+
+        encoding = 'utf-8'
+        u_string = u'String with unicode: と and control char: \u0007'
+        try:
+            # First attempt to encode to utf8
+            data = unicode_to_xml(u_string, encoding=encoding, errors='strict')
+        except XmlEncodeError:
+            # Fallback to utf-7
+            encoding = 'utf-7'
+            data = unicode_to_xml(u_string, encoding=encoding, errors='strict')
+        write_tag('<mytag encoding="%s">%s</mytag>' % (encoding, data))
+        # [...]
+        encoding = tag.attributes.encoding
+        u_string = xml_to_unicode(u_string, encoding=encoding)
+
+    Using code similar to that, you can have some fields encoded using your
+    default encoding and fall back to ``utf-7`` if there are :term:`control
+    characters` present.
+
+    .. note::
+
+        If your goal is to preserve the :term:`control characters` you cannot
+        simply save the entire file as ``utf-7`` and set the xml encoding
+        parameter to ``utf-7``.
+        Because XML doesn't allow :term:`control characters`, you have to
+        encode those separate from any encoding work that the XML parser
+        itself knows about.
+
+    .. seealso::
+
+        :func:`bytes_to_xml`
+            if you're dealing with bytes that are non-text or of an unknown
+            encoding that you must preserve on a byte for byte level.
+        :func:`guess_encoding_to_xml`
+            if you're dealing with strings in unknown encodings that you don't
+            need to save with char-for-char fidelity.
+    '''
+    if not string:
+        # Small optimization
+        return ''
+    try:
+        process_control_chars(string, strategy=control_chars)
+    except TypeError:
+        raise XmlEncodeError(k.b_('unicode_to_xml must have a unicode type as'
+            ' the first argument. Use byte_string_to_xml for byte'
+            ' strings.'))
+    except ValueError:
+        raise ValueError(k.b_('The control_chars argument to unicode_to_xml'
+            ' must be one of ignore, replace, or strict'))
+    except ControlCharError, exc:
+        raise XmlEncodeError(exc.args[0])
+
+    string = string.encode(encoding, 'xmlcharrefreplace')
+
+    # Escape characters that have special meaning in xml
+    if attrib:
+        string = xml.sax.saxutils.escape(string, entities={'"':"&quot;"})
+    else:
+        string = xml.sax.saxutils.escape(string)
+    return string
+
+def xml_to_unicode(byte_string, encoding='utf-8', errors='replace'):
+    '''Transform a byte :class:`str` from an xml file into a :class:`unicode`
+    string
+
+    :arg byte_string: byte :class:`str` to decode
+    :kwarg encoding: encoding that the byte :class:`str` is in
+    :kwarg errors: What to do if not every character is valid in
+        :attr:`encoding`. See the :func:`to_unicode` documentation for legal
+        values.
+    :rtype: :class:`unicode` string
+    :returns: string decoded from :attr:`byte_string`
+
+    This function attempts to reverse what :func:`unicode_to_xml` does. It
+    takes a byte :class:`str` (presumably read in from an xml file) and
+    expands all the html entities into unicode characters and decodes the byte
+    :class:`str` into a :class:`unicode` string. One thing it cannot do is
+    restore any :term:`control characters` that were removed prior to
+    inserting into the file. If you need to keep such characters you need to
+    use :func:`xml_to_bytes` and :func:`bytes_to_xml` or use one of the
+    strategies documented in :func:`unicode_to_xml` instead.
+    '''
+    string = to_unicode(byte_string, encoding=encoding, errors=errors)
+    string = html_entities_unescape(string)
+    return string
+
+def byte_string_to_xml(byte_string, input_encoding='utf-8', errors='replace',
+        output_encoding='utf-8', attrib=False, control_chars='replace'):
+    '''Make sure a byte :class:`str` is validly encoded for xml output
+
+    :arg byte_string: Byte :class:`str` to turn into valid xml output
+    :kwarg input_encoding: Encoding of :attr:`byte_string`. Default ``utf-8``
+    :kwarg errors: How to handle errors encountered while decoding the
+        :attr:`byte_string` into :class:`unicode` at the beginning of the
+        process. Values are:
+
+        :replace: (default) Replace the invalid bytes with a ``?``
+        :ignore: Remove the characters altogether from the output
+        :strict: Raise an :exc:`UnicodeDecodeError` when we encounter
+            a non-decodable character
+
+    :kwarg output_encoding: Encoding for the xml file that this string will go
+        into. Default is ``utf-8``. If all the characters in
+        :attr:`byte_string` are not encodable in this encoding, the unknown
+        characters will be entered into the output string using xml character
+        references.
+    :kwarg attrib: If :data:`True`, quote the string for use in an xml
+        attribute.
If :data:`False` (default), quote for use in an xml text + field. + :kwarg control_chars: XML does not allow :term:`control characters`. When + we encounter those we need to know what to do. Valid options are: + + :replace: (default) Replace the :term:`control characters` with ``?`` + :ignore: Remove the characters altogether from the output + :strict: Raise an error when we encounter a :term:`control character` + + :raises XmlEncodeError: If :attr:`control_chars` is set to ``strict`` and + the string to be made suitable for output to xml contains + :term:`control characters` then we raise this exception. + :raises UnicodeDecodeError: If errors is set to ``strict`` and the + :attr:`byte_string` contains bytes that are not decodable using + :attr:`input_encoding`, this error is raised + :rtype: byte :class:`str` + :returns: representation of the byte :class:`str` in the output encoding with + any bytes that aren't available in xml taken care of. + + Use this when you have a byte :class:`str` representing text that you need + to make suitable for output to xml. There are several cases where this + is the case. For instance, if you need to transform some strings encoded + in ``latin-1`` to :term:`utf-8` for output:: + + utf8_string = byte_string_to_xml(latin1_string, input_encoding='latin-1') + + If you already have strings in the proper encoding you may still want to + use this function to remove :term:`control characters`:: + + cleaned_string = byte_string_to_xml(string, input_encoding='utf-8', output_encoding='utf-8') + + .. seealso:: + + :func:`unicode_to_xml` + for other ideas on using this function + ''' + if not isinstance(byte_string, str): + raise XmlEncodeError(k.b_('byte_string_to_xml can only take a byte' + ' string as its first argument. Use unicode_to_xml for' + ' unicode strings')) + + # Decode the string into unicode + u_string = unicode(byte_string, input_encoding, errors) + return unicode_to_xml(u_string, encoding=output_encoding, + attrib=attrib, control_chars=control_chars) + +def xml_to_byte_string(byte_string, input_encoding='utf-8', errors='replace', + output_encoding='utf-8'): + '''Transform a byte :class:`str` from an xml file into :class:`unicode` + string + + :arg byte_string: byte :class:`str` to decode + :kwarg input_encoding: encoding that the byte :class:`str` is in + :kwarg errors: What to do if not every character is valid in + :attr:`encoding`. See the :func:`to_unicode` docstring for legal + values. + :kwarg output_encoding: Encoding for the output byte :class:`str` + :returns: :class:`unicode` string decoded from :attr:`byte_string` + + This function attempts to reverse what :func:`unicode_to_xml` does. It + takes a byte :class:`str` (presumably read in from an xml file) and + expands all the html entities into unicode characters and decodes the + byte :class:`str` into a :class:`unicode` string. One thing it cannot do + is restore any :term:`control characters` that were removed prior to + inserting into the file. If you need to keep such characters you need to + use :func:`xml_to_bytes` and :func:`bytes_to_xml` or use one of the + strategies documented in :func:`unicode_to_xml` instead. 
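An illustrative round trip through these helpers (an editor's sketch; the
``latin-1`` sample bytes are an assumption, not from the original
docstring)::

    >>> xml_field = byte_string_to_xml('caf\xe9', input_encoding='latin-1')
    >>> xml_field
    'caf\xc3\xa9'
    >>> xml_to_byte_string(xml_field, input_encoding='utf-8', output_encoding='latin-1')
    'caf\xe9'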
+ ''' + string = xml_to_unicode(byte_string, input_encoding, errors) + return to_bytes(string, output_encoding, errors) + +def bytes_to_xml(byte_string, *args, **kwargs): + '''Return a byte :class:`str` encoded so it is valid inside of any xml + file + + :arg byte_string: byte :class:`str` to transform + :arg \*args, \*\*kwargs: extra arguments to this function are passed on to + the function actually implementing the encoding. You can use this to + tweak the output in some cases but, as a general rule, you shouldn't + because the underlying encoding function is not guaranteed to remain + the same. + :rtype: byte :class:`str` consisting of all :term:`ASCII` characters + :returns: byte :class:`str` representation of the input. This will be encoded + using base64. + + This function is made especially to put binary information into xml + documents. + + This function is intended for encoding things that must be preserved + byte-for-byte. If you want to encode a byte string that's text and don't + mind losing the actual bytes you probably want to try :func:`byte_string_to_xml` + or :func:`guess_encoding_to_xml` instead. + + .. note:: + + Although the current implementation uses :func:`base64.b64encode` and + there's no plans to change it, that isn't guaranteed. If you want to + make sure that you can encode and decode these messages it's best to + use :func:`xml_to_bytes` if you use this function to encode. + ''' + # Can you do this yourself? Yes, you can. + return b64encode(byte_string, *args, **kwargs) + +def xml_to_bytes(byte_string, *args, **kwargs): + '''Decode a string encoded using :func:`bytes_to_xml` + + :arg byte_string: byte :class:`str` to transform. This should be a base64 + encoded sequence of bytes originally generated by :func:`bytes_to_xml`. + :arg \*args, \*\*kwargs: extra arguments to this function are passed on to + the function actually implementing the encoding. You can use this to + tweak the output in some cases but, as a general rule, you shouldn't + because the underlying encoding function is not guaranteed to remain + the same. + :rtype: byte :class:`str` + :returns: byte :class:`str` that's the decoded input + + If you've got fields in an xml document that were encoded with + :func:`bytes_to_xml` then you want to use this function to undecode them. + It converts a base64 encoded string into a byte :class:`str`. + + .. note:: + + Although the current implementation uses :func:`base64.b64decode` and + there's no plans to change it, that isn't guaranteed. If you want to + make sure that you can encode and decode these messages it's best to + use :func:`bytes_to_xml` if you use this function to decode. + ''' + return b64decode(byte_string, *args, **kwargs) + +def guess_encoding_to_xml(string, output_encoding='utf-8', attrib=False, + control_chars='replace'): + '''Return a byte :class:`str` suitable for inclusion in xml + + :arg string: :class:`unicode` or byte :class:`str` to be transformed into + a byte :class:`str` suitable for inclusion in xml. If string is + a byte :class:`str` we attempt to guess the encoding. If we cannot guess, + we fallback to ``latin-1``. + :kwarg output_encoding: Output encoding for the byte :class:`str`. This + should match the encoding of your xml file. + :kwarg attrib: If :data:`True`, escape the item for use in an xml + attribute. If :data:`False` (default) escape the item for use in + a text node. 
+    :returns: :term:`utf-8` encoded byte :class:`str`
+
+    '''
+    # Unicode strings can just be run through unicode_to_xml()
+    if isinstance(string, unicode):
+        return unicode_to_xml(string, encoding=output_encoding,
+                attrib=attrib, control_chars=control_chars)
+
+    # Guess the encoding of the byte strings
+    input_encoding = guess_encoding(string)
+
+    # Return the new byte string
+    return byte_string_to_xml(string, input_encoding=input_encoding,
+            errors='replace', output_encoding=output_encoding,
+            attrib=attrib, control_chars=control_chars)
+
+def to_xml(string, encoding='utf-8', attrib=False, control_chars='ignore'):
+    '''*Deprecated*: Use :func:`guess_encoding_to_xml` instead
+    '''
+    warnings.warn(k.b_('kitchen.text.converters.to_xml is deprecated. Use'
+        ' kitchen.text.converters.guess_encoding_to_xml instead.'),
+        DeprecationWarning, stacklevel=2)
+    return guess_encoding_to_xml(string, output_encoding=encoding,
+            attrib=attrib, control_chars=control_chars)
+
+__all__ = ('BYTE_EXCEPTION_CONVERTERS', 'EXCEPTION_CONVERTERS',
+        'byte_string_to_xml', 'bytes_to_xml', 'exception_to_bytes',
+        'exception_to_unicode', 'getwriter', 'guess_encoding_to_xml',
+        'to_bytes', 'to_str', 'to_unicode', 'to_utf8', 'to_xml',
+        'unicode_to_xml', 'xml_to_byte_string', 'xml_to_bytes',
+        'xml_to_unicode')
diff --git a/kitchen/text/display.py b/kitchen/text/display.py
new file mode 100644
index 0000000..8624109
--- /dev/null
+++ b/kitchen/text/display.py
@@ -0,0 +1,901 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2010 Red Hat, Inc.
+# Copyright (c) 2010 Ville Skyttä
+# Copyright (c) 2009 Tim Lauridsen
+# Copyright (c) 2007 Marcus Kuhn
+#
+# kitchen is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# kitchen is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+# more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with kitchen; if not, see <http://www.gnu.org/licenses/>
+#
+# Authors:
+#   James Antill
+#   Marcus Kuhn
+#   Toshio Kuratomi
+#   Tim Lauridsen
+#   Ville Skyttä
+#
+# Portions of this are from yum/i18n.py
+'''
+-----------------------
+Format Text for Display
+-----------------------
+
+Functions related to displaying unicode text. Unicode characters don't all
+have the same width so we need helper functions for displaying them.
+
+.. versionadded:: 0.2 kitchen.display API 1.0.0
+'''
+import itertools
+import unicodedata
+
+from kitchen import b_
+from kitchen.text.converters import to_unicode, to_bytes
+from kitchen.text.exceptions import ControlCharError
+
+# This is ported from ustr_utf8_* which I got from:
+#     http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
+# I've tried to leave it close to the original C (same names etc.) so that
+# it is easy to read/compare both versions... James Antilles
+
+#
+# Reimplemented quite a bit of this for speed. Use the bzr log or annotate
+# commands to see what I've changed since importing this file. -Toshio Kuratomi
+
+# ----------------------------- BEG utf8 -----------------------------
+# This is an implementation of wcwidth() and wcswidth() (defined in
+# IEEE Std 1002.1-2001) for Unicode.
+# +# http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html +# http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html +# +# In fixed-width output devices, Latin characters all occupy a single +# "cell" position of equal width, whereas ideographic CJK characters +# occupy two such cells. Interoperability between terminal-line +# applications and (teletype-style) character terminals using the +# UTF-8 encoding requires agreement on which character should advance +# the cursor by how many cell positions. No established formal +# standards exist at present on which Unicode character shall occupy +# how many cell positions on character terminals. These routines are +# a first attempt of defining such behavior based on simple rules +# applied to data provided by the Unicode Consortium. +# +# [...] +# +# Markus Kuhn -- 2007-05-26 (Unicode 5.0) +# +# Permission to use, copy, modify, and distribute this software +# for any purpose and without fee is hereby granted. The author +# disclaims all warranties with regard to this software. +# +# Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c + +# Renamed but still pretty much JA's port of MK's code +def _interval_bisearch(value, table): + '''Binary search in an interval table. + + :arg value: numeric value to search for + :arg table: Ordered list of intervals. This is a list of two-tuples. The + elements of the two-tuple define an interval's start and end points. + :returns: If :attr:`value` is found within an interval in the :attr:`table` + return :data:`True`. Otherwise, :data:`False` + + This function checks whether a numeric value is present within a table + of intervals. It checks using a binary search algorithm, dividing the + list of values in half and checking against the values until it determines + whether the value is in the table. 
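For example (an editor's illustrative sketch, checked against the
:data:`_COMBINING` table defined below)::

    >>> _interval_bisearch(0x300, _COMBINING)   # COMBINING GRAVE ACCENT
    True
    >>> _interval_bisearch(0x41, _COMBINING)    # u'A' does not combine
    False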
+ ''' + minimum = 0 + maximum = len(table) - 1 + if value < table[minimum][0] or value > table[maximum][1]: + return False + + while maximum >= minimum: + mid = (minimum + maximum) / 2 + if value > table[mid][1]: + minimum = mid + 1 + elif value < table[mid][0]: + maximum = mid - 1 + else: + return True + + return False + +_COMBINING = ( + (0x300, 0x36f), (0x483, 0x489), (0x591, 0x5bd), + (0x5bf, 0x5bf), (0x5c1, 0x5c2), (0x5c4, 0x5c5), + (0x5c7, 0x5c7), (0x600, 0x603), (0x610, 0x61a), + (0x64b, 0x65e), (0x670, 0x670), (0x6d6, 0x6e4), + (0x6e7, 0x6e8), (0x6ea, 0x6ed), (0x70f, 0x70f), + (0x711, 0x711), (0x730, 0x74a), (0x7a6, 0x7b0), + (0x7eb, 0x7f3), (0x816, 0x819), (0x81b, 0x823), + (0x825, 0x827), (0x829, 0x82d), (0x901, 0x902), + (0x93c, 0x93c), (0x941, 0x948), (0x94d, 0x94d), + (0x951, 0x954), (0x962, 0x963), (0x981, 0x981), + (0x9bc, 0x9bc), (0x9c1, 0x9c4), (0x9cd, 0x9cd), + (0x9e2, 0x9e3), (0xa01, 0xa02), (0xa3c, 0xa3c), + (0xa41, 0xa42), (0xa47, 0xa48), (0xa4b, 0xa4d), + (0xa70, 0xa71), (0xa81, 0xa82), (0xabc, 0xabc), + (0xac1, 0xac5), (0xac7, 0xac8), (0xacd, 0xacd), + (0xae2, 0xae3), (0xb01, 0xb01), (0xb3c, 0xb3c), + (0xb3f, 0xb3f), (0xb41, 0xb43), (0xb4d, 0xb4d), + (0xb56, 0xb56), (0xb82, 0xb82), (0xbc0, 0xbc0), + (0xbcd, 0xbcd), (0xc3e, 0xc40), (0xc46, 0xc48), + (0xc4a, 0xc4d), (0xc55, 0xc56), (0xcbc, 0xcbc), + (0xcbf, 0xcbf), (0xcc6, 0xcc6), (0xccc, 0xccd), + (0xce2, 0xce3), (0xd41, 0xd43), (0xd4d, 0xd4d), + (0xdca, 0xdca), (0xdd2, 0xdd4), (0xdd6, 0xdd6), + (0xe31, 0xe31), (0xe34, 0xe3a), (0xe47, 0xe4e), + (0xeb1, 0xeb1), (0xeb4, 0xeb9), (0xebb, 0xebc), + (0xec8, 0xecd), (0xf18, 0xf19), (0xf35, 0xf35), + (0xf37, 0xf37), (0xf39, 0xf39), (0xf71, 0xf7e), + (0xf80, 0xf84), (0xf86, 0xf87), (0xf90, 0xf97), + (0xf99, 0xfbc), (0xfc6, 0xfc6), (0x102d, 0x1030), + (0x1032, 0x1032), (0x1036, 0x1037), (0x1039, 0x103a), + (0x1058, 0x1059), (0x108d, 0x108d), (0x1160, 0x11ff), + (0x135f, 0x135f), (0x1712, 0x1714), (0x1732, 0x1734), + (0x1752, 0x1753), (0x1772, 0x1773), (0x17b4, 0x17b5), + (0x17b7, 0x17bd), (0x17c6, 0x17c6), (0x17c9, 0x17d3), + (0x17dd, 0x17dd), (0x180b, 0x180d), (0x18a9, 0x18a9), + (0x1920, 0x1922), (0x1927, 0x1928), (0x1932, 0x1932), + (0x1939, 0x193b), (0x1a17, 0x1a18), (0x1a60, 0x1a60), + (0x1a75, 0x1a7c), (0x1a7f, 0x1a7f), (0x1b00, 0x1b03), + (0x1b34, 0x1b34), (0x1b36, 0x1b3a), (0x1b3c, 0x1b3c), + (0x1b42, 0x1b42), (0x1b44, 0x1b44), (0x1b6b, 0x1b73), + (0x1baa, 0x1baa), (0x1c37, 0x1c37), (0x1cd0, 0x1cd2), + (0x1cd4, 0x1ce0), (0x1ce2, 0x1ce8), (0x1ced, 0x1ced), + (0x1dc0, 0x1de6), (0x1dfd, 0x1dff), (0x200b, 0x200f), + (0x202a, 0x202e), (0x2060, 0x2063), (0x206a, 0x206f), + (0x20d0, 0x20f0), (0x2cef, 0x2cf1), (0x2de0, 0x2dff), + (0x302a, 0x302f), (0x3099, 0x309a), (0xa66f, 0xa66f), + (0xa67c, 0xa67d), (0xa6f0, 0xa6f1), (0xa806, 0xa806), + (0xa80b, 0xa80b), (0xa825, 0xa826), (0xa8c4, 0xa8c4), + (0xa8e0, 0xa8f1), (0xa92b, 0xa92d), (0xa953, 0xa953), + (0xa9b3, 0xa9b3), (0xa9c0, 0xa9c0), (0xaab0, 0xaab0), + (0xaab2, 0xaab4), (0xaab7, 0xaab8), (0xaabe, 0xaabf), + (0xaac1, 0xaac1), (0xabed, 0xabed), (0xfb1e, 0xfb1e), + (0xfe00, 0xfe0f), (0xfe20, 0xfe26), (0xfeff, 0xfeff), + (0xfff9, 0xfffb), (0x101fd, 0x101fd), (0x10a01, 0x10a03), + (0x10a05, 0x10a06), (0x10a0c, 0x10a0f), (0x10a38, 0x10a3a), + (0x10a3f, 0x10a3f), (0x110b9, 0x110ba), (0x1d165, 0x1d169), + (0x1d16d, 0x1d182), (0x1d185, 0x1d18b), (0x1d1aa, 0x1d1ad), + (0x1d242, 0x1d244), (0xe0001, 0xe0001), (0xe0020, 0xe007f), + (0xe0100, 0xe01ef), ) +''' +Internal table, provided by this module to list :term:`code points` which 
+combine with other characters and therefore should have no :term:`textual +width`. This is a sorted :class:`tuple` of non-overlapping intervals. Each +interval is a :class:`tuple` listing a starting :term:`code point` and ending +:term:`code point`. Every :term:`code point` between the two end points is +a combining character. + +.. seealso:: + + :func:`~kitchen.text.display._generate_combining_table` + for how this table is generated + +This table was last regenerated on python-2.7.0 with +:data:`unicodedata.unidata_version` 5.1.0 +''' + +# New function from Toshio Kuratomi (LGPLv2+) +def _generate_combining_table(): + '''Combine Markus Kuhn's data with :mod:`unicodedata` to make combining + char list + + :rtype: :class:`tuple` of tuples + :returns: :class:`tuple` of intervals of :term:`code points` that are + combining character. Each interval is a 2-:class:`tuple` of the + starting :term:`code point` and the ending :term:`code point` for the + combining characters. + + In normal use, this function serves to tell how we're generating the + combining char list. For speed reasons, we use this to generate a static + list and just use that later. + + Markus Kuhn's list of combining characters is more complete than what's in + the python :mod:`unicodedata` library but the python :mod:`unicodedata` is + synced against later versions of the unicode database + + This is used to generate the :data:`~kitchen.text.display._COMBINING` + table. + ''' + # Marcus Kuhn's sorted list of non-overlapping intervals of non-spacing + # characters generated ifrom Unicode 5.0 data by: + # "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" + markus_kuhn_combining_5_0 = ( + ( 0x0300, 0x036F ), ( 0x0483, 0x0486 ), ( 0x0488, 0x0489 ), + ( 0x0591, 0x05BD ), ( 0x05BF, 0x05BF ), ( 0x05C1, 0x05C2 ), + ( 0x05C4, 0x05C5 ), ( 0x05C7, 0x05C7 ), ( 0x0600, 0x0603 ), + ( 0x0610, 0x0615 ), ( 0x064B, 0x065E ), ( 0x0670, 0x0670 ), + ( 0x06D6, 0x06E4 ), ( 0x06E7, 0x06E8 ), ( 0x06EA, 0x06ED ), + ( 0x070F, 0x070F ), ( 0x0711, 0x0711 ), ( 0x0730, 0x074A ), + ( 0x07A6, 0x07B0 ), ( 0x07EB, 0x07F3 ), ( 0x0901, 0x0902 ), + ( 0x093C, 0x093C ), ( 0x0941, 0x0948 ), ( 0x094D, 0x094D ), + ( 0x0951, 0x0954 ), ( 0x0962, 0x0963 ), ( 0x0981, 0x0981 ), + ( 0x09BC, 0x09BC ), ( 0x09C1, 0x09C4 ), ( 0x09CD, 0x09CD ), + ( 0x09E2, 0x09E3 ), ( 0x0A01, 0x0A02 ), ( 0x0A3C, 0x0A3C ), + ( 0x0A41, 0x0A42 ), ( 0x0A47, 0x0A48 ), ( 0x0A4B, 0x0A4D ), + ( 0x0A70, 0x0A71 ), ( 0x0A81, 0x0A82 ), ( 0x0ABC, 0x0ABC ), + ( 0x0AC1, 0x0AC5 ), ( 0x0AC7, 0x0AC8 ), ( 0x0ACD, 0x0ACD ), + ( 0x0AE2, 0x0AE3 ), ( 0x0B01, 0x0B01 ), ( 0x0B3C, 0x0B3C ), + ( 0x0B3F, 0x0B3F ), ( 0x0B41, 0x0B43 ), ( 0x0B4D, 0x0B4D ), + ( 0x0B56, 0x0B56 ), ( 0x0B82, 0x0B82 ), ( 0x0BC0, 0x0BC0 ), + ( 0x0BCD, 0x0BCD ), ( 0x0C3E, 0x0C40 ), ( 0x0C46, 0x0C48 ), + ( 0x0C4A, 0x0C4D ), ( 0x0C55, 0x0C56 ), ( 0x0CBC, 0x0CBC ), + ( 0x0CBF, 0x0CBF ), ( 0x0CC6, 0x0CC6 ), ( 0x0CCC, 0x0CCD ), + ( 0x0CE2, 0x0CE3 ), ( 0x0D41, 0x0D43 ), ( 0x0D4D, 0x0D4D ), + ( 0x0DCA, 0x0DCA ), ( 0x0DD2, 0x0DD4 ), ( 0x0DD6, 0x0DD6 ), + ( 0x0E31, 0x0E31 ), ( 0x0E34, 0x0E3A ), ( 0x0E47, 0x0E4E ), + ( 0x0EB1, 0x0EB1 ), ( 0x0EB4, 0x0EB9 ), ( 0x0EBB, 0x0EBC ), + ( 0x0EC8, 0x0ECD ), ( 0x0F18, 0x0F19 ), ( 0x0F35, 0x0F35 ), + ( 0x0F37, 0x0F37 ), ( 0x0F39, 0x0F39 ), ( 0x0F71, 0x0F7E ), + ( 0x0F80, 0x0F84 ), ( 0x0F86, 0x0F87 ), ( 0x0F90, 0x0F97 ), + ( 0x0F99, 0x0FBC ), ( 0x0FC6, 0x0FC6 ), ( 0x102D, 0x1030 ), + ( 0x1032, 0x1032 ), ( 0x1036, 0x1037 ), ( 0x1039, 0x1039 ), + ( 0x1058, 0x1059 ), ( 0x1160, 0x11FF ), ( 0x135F, 0x135F ), + ( 
+        ( 0x1712, 0x1714 ), ( 0x1732, 0x1734 ), ( 0x1752, 0x1753 ),
+        ( 0x1772, 0x1773 ), ( 0x17B4, 0x17B5 ), ( 0x17B7, 0x17BD ),
+        ( 0x17C6, 0x17C6 ), ( 0x17C9, 0x17D3 ), ( 0x17DD, 0x17DD ),
+        ( 0x180B, 0x180D ), ( 0x18A9, 0x18A9 ), ( 0x1920, 0x1922 ),
+        ( 0x1927, 0x1928 ), ( 0x1932, 0x1932 ), ( 0x1939, 0x193B ),
+        ( 0x1A17, 0x1A18 ), ( 0x1B00, 0x1B03 ), ( 0x1B34, 0x1B34 ),
+        ( 0x1B36, 0x1B3A ), ( 0x1B3C, 0x1B3C ), ( 0x1B42, 0x1B42 ),
+        ( 0x1B6B, 0x1B73 ), ( 0x1DC0, 0x1DCA ), ( 0x1DFE, 0x1DFF ),
+        ( 0x200B, 0x200F ), ( 0x202A, 0x202E ), ( 0x2060, 0x2063 ),
+        ( 0x206A, 0x206F ), ( 0x20D0, 0x20EF ), ( 0x302A, 0x302F ),
+        ( 0x3099, 0x309A ), ( 0xA806, 0xA806 ), ( 0xA80B, 0xA80B ),
+        ( 0xA825, 0xA826 ), ( 0xFB1E, 0xFB1E ), ( 0xFE00, 0xFE0F ),
+        ( 0xFE20, 0xFE23 ), ( 0xFEFF, 0xFEFF ), ( 0xFFF9, 0xFFFB ),
+        ( 0x10A01, 0x10A03 ), ( 0x10A05, 0x10A06 ), ( 0x10A0C, 0x10A0F ),
+        ( 0x10A38, 0x10A3A ), ( 0x10A3F, 0x10A3F ), ( 0x1D167, 0x1D169 ),
+        ( 0x1D173, 0x1D182 ), ( 0x1D185, 0x1D18B ), ( 0x1D1AA, 0x1D1AD ),
+        ( 0x1D242, 0x1D244 ), ( 0xE0001, 0xE0001 ), ( 0xE0020, 0xE007F ),
+        ( 0xE0100, 0xE01EF ))
+    combining = []
+    in_interval = False
+    interval = []
+    for codepoint in xrange(0, 0xFFFFF + 1):
+        if _interval_bisearch(codepoint, markus_kuhn_combining_5_0) or \
+                unicodedata.combining(unichr(codepoint)):
+            if not in_interval:
+                # Found first part of an interval
+                interval = [codepoint]
+                in_interval = True
+        else:
+            if in_interval:
+                in_interval = False
+                interval.append(codepoint - 1)
+                combining.append(interval)
+
+    if in_interval:
+        # If we're at the end and the interval is open, close it.
+        # :W0631: We looped through a static range so we know codepoint is
+        #   defined here
+        #pylint:disable-msg=W0631
+        interval.append(codepoint)
+        combining.append(interval)
+
+    return tuple(itertools.imap(tuple, combining))
+
+# New function from Toshio Kuratomi (LGPLv2+)
+def _print_combining_table():
+    '''Print out a new :data:`_COMBINING` table
+
+    This will print a new :data:`_COMBINING` table in the format used in
+    :file:`kitchen/text/display.py`.  It's useful for updating the
+    :data:`_COMBINING` table with updated data from a new python version, as
+    the format won't change from what's already in the file.
+    '''
+    table = _generate_combining_table()
+    entries = 0
+    print '_COMBINING = ('
+    for pair in table:
+        if entries >= 3:
+            entries = 0
+            print
+        if entries == 0:
+            print ' ',
+        entries += 1
+        entry = '(0x%x, 0x%x),' % pair
+        print entry,
+    print ')'
+
+# Handling of control chars rewritten.  Rest is JA's port of MK's C code.
+# -Toshio Kuratomi
+def _ucp_width(ucs, control_chars='guess'):
+    '''Get the :term:`textual width` of a ucs character
+
+    :arg ucs: integer representing a single unicode :term:`code point`
+    :kwarg control_chars: specify how to deal with :term:`control characters`.
+        Possible values are:
+
+        :guess: (default) will take a guess for :term:`control character`
+            widths.  Most codes will return zero width.  ``backspace``,
+            ``delete``, and ``clear delete`` return -1.  ``escape`` currently
+            returns -1 as well but this is not guaranteed as it's not always
+            correct
+        :strict: will raise :exc:`~kitchen.text.exceptions.ControlCharError`
+            if a :term:`control character` is encountered
+
+    :raises ControlCharError: if the :term:`code point` is a unicode
+        :term:`control character` and :attr:`control_chars` is set to 'strict'
+    :returns: :term:`textual width` of the character.
+
+    .. note::
+
+        It's important to remember this is :term:`textual width` and not the
+        number of characters or bytes.
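+
+    A few representative values (illustrative examples derived from the
+    rules above, not an exhaustive list)::
+
+        >>> _ucp_width(ord(u'a'))   # ordinary narrow character
+        1
+        >>> _ucp_width(0x4e00)      # CJK ideograph takes two cells
+        2
+        >>> _ucp_width(0x0301)      # combining accent has no width of its own
+        0
+        >>> _ucp_width(0x08)        # backspace is guessed as -1
+        -1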
+    '''
+    # test for 8-bit control characters
+    if ucs < 32 or (ucs < 0xa0 and ucs >= 0x7f):
+        # Control character detected
+        if control_chars == 'strict':
+            raise ControlCharError(b_('_ucp_width does not understand how to'
+                ' assign a width value to control characters.'))
+        if ucs in (0x08, 0x7f, 0x94):
+            # Backspace, delete, and clear delete remove a single character
+            return -1
+        if ucs == 0x1b:
+            # Escape is tricky.  It removes some number of characters that
+            # come after it but the amount depends on what is interpreting
+            # the code.
+            # So this is often going to be wrong but other values would be
+            # wrong as well.
+            return -1
+        # All other control characters get 0 width
+        return 0
+
+    if _interval_bisearch(ucs, _COMBINING):
+        # Combining characters return 0 width as they will be combined with
+        # the width from other characters
+        return 0
+
+    # if we arrive here, ucs is not a combining or C0/C1 control character
+
+    return (1 +
+        (ucs >= 0x1100 and
+            (ucs <= 0x115f or             # Hangul Jamo init. consonants
+            ucs == 0x2329 or ucs == 0x232a or
+            (ucs >= 0x2e80 and ucs <= 0xa4cf and
+                ucs != 0x303f) or         # CJK ... Yi
+            (ucs >= 0xac00 and ucs <= 0xd7a3) or  # Hangul Syllables
+            (ucs >= 0xf900 and ucs <= 0xfaff) or  # CJK Compatibility Ideographs
+            (ucs >= 0xfe10 and ucs <= 0xfe19) or  # Vertical forms
+            (ucs >= 0xfe30 and ucs <= 0xfe6f) or  # CJK Compatibility Forms
+            (ucs >= 0xff00 and ucs <= 0xff60) or  # Fullwidth Forms
+            (ucs >= 0xffe0 and ucs <= 0xffe6) or
+            (ucs >= 0x20000 and ucs <= 0x2fffd) or
+            (ucs >= 0x30000 and ucs <= 0x3fffd))))
+
+# Wholly rewritten by me (LGPLv2+) -Toshio Kuratomi
+def textual_width(msg, control_chars='guess', encoding='utf-8',
+        errors='replace'):
+    '''Get the :term:`textual width` of a string
+
+    :arg msg: :class:`unicode` string or byte :class:`str` to get the width of
+    :kwarg control_chars: specify how to deal with :term:`control characters`.
+        Possible values are:
+
+        :guess: (default) will take a guess for :term:`control character`
+            widths.  Most codes will return zero width.  ``backspace``,
+            ``delete``, and ``clear delete`` return -1.  ``escape`` currently
+            returns -1 as well but this is not guaranteed as it's not always
+            correct
+        :strict: will raise :exc:`kitchen.text.exceptions.ControlCharError`
+            if a :term:`control character` is encountered
+
+    :kwarg encoding: If we are given a byte :class:`str` this is used to
+        decode it into a :class:`unicode` string.  Any characters that are
+        not decodable in this encoding will get a value dependent on the
+        :attr:`errors` parameter.
+    :kwarg errors: How to treat errors decoding the byte :class:`str` to
+        a :class:`unicode` string.  Legal values are the same as for
+        :func:`kitchen.text.converters.to_unicode`.  The default value of
+        ``replace`` will cause undecodable byte sequences to have a width of
+        one.  ``ignore`` will give them a width of zero.
+    :raises ControlCharError: if :attr:`msg` contains a :term:`control
+        character` and :attr:`control_chars` is ``strict``.
+    :returns: :term:`Textual width` of the :attr:`msg`.  This is the amount
+        of space that the string will consume on a monospace display,
+        measured in the number of cell positions or columns it will take up.
+        This is **not** the number of glyphs that are in the string.
+
+    .. note::
+
+        This function can be wrong sometimes because Unicode does not specify
+        a strict width value for all of the :term:`code points`.
+        In particular, we've found that some Tamil characters take up to
+        four character cells but we return a lesser amount.
+    '''
+    # On python 2.6.4, x86_64, I've benchmarked a few alternate
+    # implementations::
+    #
+    #   timeit.repeat('display.textual_width(data)',
+    #       'from __main__ import display, data', number=100)
+    # I varied data by size and content (1MB of ascii, a few words, 43K utf8,
+    # and a unicode type)
+    #
+    # :this implementation: fastest across the board
+    #
+    # :list comprehension: 6-16% slower
+    #   return sum([_ucp_width(ord(c), control_chars=control_chars)
+    #       for c in msg])
+    #
+    # :generator expression: 9-18% slower
+    #   return sum((_ucp_width(ord(c), control_chars=control_chars) for c in
+    #       msg))
+    #
+    # :lambda: 10-19% slower
+    #   return sum(itertools.imap(lambda x: _ucp_width(ord(x), control_chars),
+    #       msg))
+    #
+    # :partial application: 13-22% slower
+    #   func = functools.partial(_ucp_width, control_chars=control_chars)
+    #   return sum(itertools.imap(func, itertools.imap(ord, msg)))
+    #
+    # :the original code: 4-38% slower
+    #   The 4% was for the short, ascii only string.  All the other pieces of
+    #   data yielded over 30% slower times.
+
+    # Non-decodable data is just assigned a single cell width
+    msg = to_unicode(msg, encoding=encoding, errors=errors)
+    # Add the width of each char
+    return sum(
+            # calculate width of each char
+            itertools.starmap(_ucp_width,
+                # Setup the arguments to _ucp_width
+                itertools.izip(
+                    # int value of each char
+                    itertools.imap(ord, msg),
+                    # control_chars arg in a form that izip will deal with
+                    itertools.repeat(control_chars))))
+
+# Wholly rewritten by me -Toshio Kuratomi
+def textual_width_chop(msg, chop, encoding='utf-8', errors='replace'):
+    '''Given a string, return it chopped to a given :term:`textual width`
+
+    :arg msg: :class:`unicode` string or byte :class:`str` to chop
+    :arg chop: Chop :attr:`msg` if it exceeds this :term:`textual width`
+    :kwarg encoding: If we are given a byte :class:`str`, this is used to
+        decode it into a :class:`unicode` string.  Any characters that are
+        not decodable in this encoding will be assigned a width of one.
+    :kwarg errors: How to treat errors decoding the byte :class:`str` to
+        :class:`unicode`.  Legal values are the same as for
+        :func:`kitchen.text.converters.to_unicode`
+    :rtype: :class:`unicode` string
+    :returns: :class:`unicode` string of the :attr:`msg` chopped at the given
+        :term:`textual width`
+
+    This is what you want to use instead of ``%.*s``, as it does the "right"
+    thing with regard to :term:`UTF-8` sequences, :term:`control characters`,
+    and characters that take more than one cell position.
Eg:: + + >>> # Wrong: only displays 8 characters because it is operating on bytes + >>> print "%.*s" % (10, 'café ñunru!') + café ñun + >>> # Properly operates on graphemes + >>> '%s' % (textual_width_chop('café ñunru!', 10)) + café ñunru + >>> # takes too many columns because the kanji need two cell positions + >>> print '1234567890\\n%.*s' % (10, u'一二三四五六七八九十') + 1234567890 + 一二三四五六七八九十 + >>> # Properly chops at 10 columns + >>> print '1234567890\\n%s' % (textual_width_chop(u'一二三四五六七八九十', 10)) + 1234567890 + 一二三四五 + + ''' + + msg = to_unicode(msg, encoding=encoding, errors=errors) + + width = textual_width(msg) + if width <= chop: + return msg + maximum = len(msg) + if maximum > chop * 2: + # A character can take at most 2 cell positions so this is the actual + # maximum + maximum = chop * 2 + minimum = 0 + eos = maximum + if eos > chop: + eos = chop + width = textual_width(msg[:eos]) + + while True: + # if current width is high, + if width > chop: + # calculate new midpoint + mid = minimum + (eos - minimum) / 2 + if mid == eos: + break + if (eos - chop) < (eos - mid): + while width > chop: + width = width - _ucp_width(ord(msg[eos-1])) + eos -= 1 + return msg[:eos] + # subtract distance between eos and mid from width + width = width - textual_width(msg[mid:eos]) + maximum = eos + eos = mid + # if current width is low, + elif width < chop: + # Note: at present, the if (eos - chop) < (eos - mid): + # short-circuit above means that we never use this branch. + + # calculate new midpoint + mid = eos + (maximum - eos) / 2 + if mid == eos: + break + if (chop - eos) < (mid - eos): + while width < chop: + new_width = _ucp_width(ord(msg[eos])) + width = width + new_width + eos += 1 + return msg[:eos] + + # add distance between eos and new mid to width + width = width + textual_width(msg[eos:mid]) + minimum = eos + eos = mid + if eos > maximum: + eos = maximum + break + # if current is just right + else: + return msg[:eos] + return msg[:eos] + +# I made some adjustments for using unicode but largely unchanged from JA's +# port of MK's code -Toshio +def textual_width_fill(msg, fill, chop=None, left=True, prefix='', suffix=''): + '''Expand a :class:`unicode` string to a specified :term:`textual width` + or chop to same + + :arg msg: :class:`unicode` string to format + :arg fill: pad string until the :term:`textual width` of the string is + this length + :kwarg chop: before doing anything else, chop the string to this length. + Default: Don't chop the string at all + :kwarg left: If :data:`True` (default) left justify the string and put the + padding on the right. If :data:`False`, pad on the left side. + :kwarg prefix: Attach this string before the field we're filling + :kwarg suffix: Append this string to the end of the field we're filling + :rtype: :class:`unicode` string + :returns: :attr:`msg` formatted to fill the specified width. If no + :attr:`chop` is specified, the string could exceed the fill length + when completed. If :attr:`prefix` or :attr:`suffix` are printable + characters, the string could be longer than the fill width. + + .. note:: + + :attr:`prefix` and :attr:`suffix` should be used for "invisible" + characters like highlighting, color changing escape codes, etc. The + fill characters are appended outside of any :attr:`prefix` or + :attr:`suffix` elements. This allows you to only highlight + :attr:`msg` inside of the field you're filling. + + .. warning:: + + :attr:`msg`, :attr:`prefix`, and :attr:`suffix` should all be + representable as unicode characters. 
+        In particular, any escape sequences in :attr:`prefix` and
+        :attr:`suffix` need to be convertible to :class:`unicode`.  If you
+        need to use byte sequences here rather than unicode characters, use
+        :func:`~kitchen.text.display.byte_string_textual_width_fill` instead.
+
+    This function expands a string to fill a field of a particular
+    :term:`textual width`.  Use it instead of ``%*.*s``, as it does the
+    "right" thing with regard to :term:`UTF-8` sequences, :term:`control
+    characters`, and characters that take more than one cell position in
+    a display.  Example usage::
+
+        >>> msg = u'一二三四五六七八九十'
+        >>> prefix = u'\x1b[7m'
+        >>> suffix = u'\x1b[0m'
+        >>> # Wrong: This uses 10 characters instead of 10 cells:
+        >>> u":%-*.*s:" % (10, 10, msg[:9])
+        :一二三四五六七八九 :
+        >>> # This uses 10 cells like we really want:
+        >>> u":%s:" % (textual_width_fill(msg[:9], 10, 10))
+        :一二三四五:
+
+        >>> # Wrong: Right aligned in the field, but too many cells
+        >>> u"%20.10s" % (msg)
+                  一二三四五六七八九十
+        >>> # Correct: Right aligned with proper number of cells
+        >>> u"%s" % (textual_width_fill(msg, 20, 10, left=False))
+                  一二三四五
+
+        >>> # Wrong: Adding some escape characters to highlight the line but too many cells
+        >>> u"%s%20.10s%s" % (prefix, msg, suffix)
+        u'\x1b[7m          一二三四五六七八九十\x1b[0m'
+        >>> # Correct highlight of the line
+        >>> u"%s%s%s" % (prefix, textual_width_fill(msg, 20, 10, left=False), suffix)
+        u'\x1b[7m          一二三四五\x1b[0m'
+
+        >>> # Correct way to not highlight the fill
+        >>> u"%s" % (textual_width_fill(msg, 20, 10, left=False, prefix=prefix, suffix=suffix))
+        u'          \x1b[7m一二三四五\x1b[0m'
+    '''
+    msg = to_unicode(msg)
+    if chop is not None:
+        msg = textual_width_chop(msg, chop)
+    width = textual_width(msg)
+    if width >= fill:
+        if prefix or suffix:
+            msg = u''.join([prefix, msg, suffix])
+    else:
+        extra = u' ' * (fill - width)
+        if left:
+            msg = u''.join([prefix, msg, suffix, extra])
+        else:
+            msg = u''.join([extra, prefix, msg, suffix])
+    return msg
+
+def _textual_width_le(width, *args):
+    '''Optimize the common case when deciding which :term:`textual width` is
+    larger
+
+    :arg width: :term:`textual width` to compare against.
+    :arg \*args: :class:`unicode` strings to check the total :term:`textual
+        width` of
+    :returns: :data:`True` if the total :term:`textual width` of :attr:`args`
+        is less than or equal to :attr:`width`.  Otherwise :data:`False`.
+
+    We often want to know "does X fit in Y".  It takes a while to use
+    :func:`textual_width` to calculate this.  However, we know that each
+    canonically composed :class:`unicode` character has a :term:`textual
+    width` of 1 or 2.  With this we can take the following shortcuts:
+
+    1) If the number of canonically composed characters is more than the
+       width, the true :term:`textual width` cannot be less than the width.
+    2) If the number of canonically composed characters * 2 is less than or
+       equal to the width then the :term:`textual width` must be ok.
+
+    The :term:`textual width` of a canonically composed :class:`unicode`
+    string will always be greater than or equal to the number of
+    :class:`unicode` characters.  So we can first check whether the number of
+    composed characters exceeds the asked for width; if it does, we can
+    return :data:`False` immediately.  If twice the number of characters
+    still fits within the width we can return :data:`True` immediately.
+    Only when neither shortcut (nor the cheaper check against the byte
+    length) decides the question do we have to do a full :term:`textual
+    width` lookup.
+    '''
+    string = ''.join(args)
+    string = unicodedata.normalize('NFC', string)
+    if len(string) > width:
+        return False
+    elif len(string) * 2 <= width:
+        return True
+    elif len(to_bytes(string)) <= width:
+        # Check against bytes.
+        # utf8 has the property of having the same amount or more bytes per
+        # character than textual width.
+        return True
+    else:
+        true_width = textual_width(string)
+        return true_width <= width
+
+def wrap(text, width=70, initial_indent=u'', subsequent_indent=u'',
+        encoding='utf-8', errors='replace'):
+    '''Works like we want :func:`textwrap.wrap` to work
+
+    :arg text: :class:`unicode` string or byte :class:`str` to wrap
+    :kwarg width: :term:`textual width` at which to wrap.  Default: 70
+    :kwarg initial_indent: string to use to indent the first line.  Default:
+        do not indent.
+    :kwarg subsequent_indent: string to use to indent subsequent lines.
+        Default: do not indent
+    :kwarg encoding: Encoding to use if :attr:`text` is a byte :class:`str`
+    :kwarg errors: error handler to use if :attr:`text` is a byte
+        :class:`str` and contains some undecodable characters.
+    :rtype: :class:`list` of :class:`unicode` strings
+    :returns: list of lines that have been text wrapped and indented.
+
+    :func:`textwrap.wrap` from the |stdlib|_ has two drawbacks that this
+    function attempts to fix:
+
+    1. It does not handle :term:`textual width`.  It only operates on bytes
+       or characters which are both inadequate (due to multi-byte and double
+       width characters).
+    2. It malforms lists and blocks.
+    '''
+    # Tested with:
+    #   yum info robodoc gpicview php-pear-Net-Socket wmctrl ustr moreutils
+    #   mediawiki-HNP ocspd insight yum mousepad
+    # ...at 120, 80 and 40 chars.
+    # Also, notable among lots of others, searching for "\n ":
+    #   exim-clamav, jpackage-utils, tcldom, synaptics, "quake3",
+    #   perl-Class-Container, ez-ipupdate, perl-Net-XMPP, "kipi-plugins",
+    #   perl-Apache-DBI, netcdf, python-configobj, "translate-toolkit",
+    #   alpine, "udunits", "conntrack-tools"
+    #
+    # Note that we "fail" on:
+    #   alsa-plugins-jack, setools*, dblatex, uisp, "perl-Getopt-GUI-Long",
+    #   suitesparse, "synce-serial", writer2latex, xenwatch, ltsp-utils
+
+    def _indent_at_beg(line):
+        '''Return the indent to use for this and (possibly) subsequent lines
+
+        :arg line: :class:`unicode` line of text to process
+        :rtype: tuple
+        :returns: tuple of the count of whitespace before getting to the
+            start of this line followed by a count to the following indent
+            if this block of text is an entry in a list.
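+
+        For example (illustrative; return values derived from the logic
+        below)::
+
+            >>> _indent_at_beg(u'plain text')
+            (0, 0)
+            >>> _indent_at_beg(u'  * a list item')
+            (2, 4)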
+ ''' + # Find the first non-whitespace character + try: + char = line.strip()[0] + except IndexError: + # All whitespace + return 0, 0 + else: + count = line.find(char) + + # if we have a bullet character, check for list + if char not in u'-*.o\u2022\u2023\u2218': + # No bullet; not a list + return count, 0 + + # List: Keep searching until we hit the innermost list + nxt = _indent_at_beg(line[count+1:]) + nxt = nxt[1] or nxt[0] + if nxt: + return count, count + 1 + nxt + return count, 0 + + initial_indent = to_unicode(initial_indent, encoding=encoding, + errors=errors) + subsequent_indent = to_unicode(subsequent_indent, encoding=encoding, + errors=errors) + subsequent_indent_width = textual_width(subsequent_indent) + + text = to_unicode(text, encoding=encoding, errors=errors).rstrip(u'\n') + lines = text.expandtabs().split(u'\n') + + ret = [] + indent = initial_indent + wrap_last = False + cur_sab = 0 + cur_spc_indent = 0 + for line in lines: + line = line.rstrip(u' ') + (last_sab, last_spc_indent) = (cur_sab, cur_spc_indent) + (cur_sab, cur_spc_indent) = _indent_at_beg(line) + force_nl = False # We want to stop wrapping under "certain" conditions: + if wrap_last and cur_spc_indent: # if line starts a list or + force_nl = True + if wrap_last and cur_sab == len(line):# is empty line + force_nl = True + if wrap_last and not last_spc_indent: # if we don't continue a list + if cur_sab >= 4 and cur_sab != last_sab: # and is "block indented" + force_nl = True + if force_nl: + ret.append(indent.rstrip(u' ')) + indent = subsequent_indent + wrap_last = False + if cur_sab == len(line): # empty line, remove spaces to make it easier. + line = u'' + if wrap_last: + line = line.lstrip(u' ') + cur_spc_indent = last_spc_indent + + if _textual_width_le(width, indent, line): + wrap_last = False + ret.append(indent + line) + indent = subsequent_indent + continue + + wrap_last = True + words = line.split(u' ') + line = indent + spcs = cur_spc_indent + if not spcs and cur_sab >= 4: + spcs = cur_sab + for word in words: + if (not _textual_width_le(width, line, word) and + textual_width(line) > subsequent_indent_width): + ret.append(line.rstrip(u' ')) + line = subsequent_indent + u' ' * spcs + line += word + line += u' ' + indent = line.rstrip(u' ') + u' ' + if wrap_last: + ret.append(indent.rstrip(u' ')) + + return ret + +def fill(text, *args, **kwargs): + '''Works like we want :func:`textwrap.fill` to work + + :arg text: :class:`unicode` string or byte :class:`str` to process + :returns: :class:`unicode` string with each line separated by a newline + + .. seealso:: + + :func:`kitchen.text.display.wrap` + for other parameters that you can give this command. + + This function is a light wrapper around :func:`kitchen.text.display.wrap`. + Where that function returns a :class:`list` of lines, this function + returns one string with each line separated by a newline. + ''' + return u'\n'.join(wrap(text, *args, **kwargs)) + +# +# Byte strings +# + +def byte_string_textual_width_fill(msg, fill, chop=None, left=True, prefix='', + suffix='', encoding='utf-8', errors='replace'): + '''Expand a byte :class:`str` to a specified :term:`textual width` or chop + to same + + :arg msg: byte :class:`str` encoded in :term:`UTF-8` that we want formatted + :arg fill: pad :attr:`msg` until the :term:`textual width` is this long + :kwarg chop: before doing anything else, chop the string to this length. 
+        Default: Don't chop the string at all
+    :kwarg left: If :data:`True` (default) left justify the string and put
+        the padding on the right.  If :data:`False`, pad on the left side.
+    :kwarg prefix: Attach this byte :class:`str` before the field we're
+        filling
+    :kwarg suffix: Append this byte :class:`str` to the end of the field
+        we're filling
+    :rtype: byte :class:`str`
+    :returns: :attr:`msg` formatted to fill the specified :term:`textual
+        width`.  If no :attr:`chop` is specified, the string could exceed the
+        fill length when completed.  If :attr:`prefix` or :attr:`suffix` are
+        printable characters, the string could be longer than the fill width.
+
+    .. note::
+
+        :attr:`prefix` and :attr:`suffix` should be used for "invisible"
+        characters like highlighting, color changing escape codes, etc.  The
+        fill characters are appended outside of any :attr:`prefix` or
+        :attr:`suffix` elements.  This allows you to only highlight
+        :attr:`msg` inside of the field you're filling.
+
+    .. seealso::
+
+        :func:`~kitchen.text.display.textual_width_fill`
+            For example usage.  This function has only two differences.
+
+            1. It takes byte :class:`str` for :attr:`prefix` and
+               :attr:`suffix` so you can pass in arbitrary sequences of
+               bytes, not just unicode characters.
+            2. It returns a byte :class:`str` instead of a :class:`unicode`
+               string.
+    '''
+    prefix = to_bytes(prefix, encoding=encoding, errors=errors)
+    suffix = to_bytes(suffix, encoding=encoding, errors=errors)
+
+    if chop is not None:
+        msg = textual_width_chop(msg, chop, encoding=encoding, errors=errors)
+    width = textual_width(msg)
+    msg = to_bytes(msg)
+
+    if width >= fill:
+        if prefix or suffix:
+            msg = ''.join([prefix, msg, suffix])
+    else:
+        extra = ' ' * (fill - width)
+        if left:
+            msg = ''.join([prefix, msg, suffix, extra])
+        else:
+            msg = ''.join([extra, prefix, msg, suffix])
+
+    return msg
+
+__all__ = ('byte_string_textual_width_fill', 'fill', 'textual_width',
+    'textual_width_chop', 'textual_width_fill', 'wrap')
diff --git a/kitchen/text/exceptions.py b/kitchen/text/exceptions.py
new file mode 100644
index 0000000..74468cf
--- /dev/null
+++ b/kitchen/text/exceptions.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2010 Red Hat, Inc
+#
+# kitchen is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# kitchen is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with kitchen; if not, see
+#
+# Authors:
+#   Toshio Kuratomi
+#
+'''
+-----------------------
+Kitchen.text exceptions
+-----------------------
+
+Exception classes thrown by kitchen's text processing routines.
+'''
+from kitchen import exceptions
+
+class XmlEncodeError(exceptions.KitchenError):
+    '''Exception thrown when an error condition occurs while encoding an xml
+    string.
+    '''
+    pass
+
+class ControlCharError(exceptions.KitchenError):
+    '''Exception thrown when an ASCII control character is encountered.
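+
+    For instance, :func:`kitchen.text.misc.process_control_chars` raises
+    this exception when it is given a control character with
+    ``strategy='strict'`` (an illustrative session; the message text may
+    vary with locale)::
+
+        >>> from kitchen.text.misc import process_control_chars
+        >>> process_control_chars(u'a\x02b', strategy='strict')
+        Traceback (most recent call last):
+            ...
+        ControlCharError: ASCII control code present in string input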
+    '''
+    pass
+
+__all__ = ('XmlEncodeError', 'ControlCharError')
diff --git a/kitchen/text/misc.py b/kitchen/text/misc.py
new file mode 100644
index 0000000..ca1be44
--- /dev/null
+++ b/kitchen/text/misc.py
@@ -0,0 +1,313 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) 2011 Red Hat, Inc
+# Copyright (c) 2010 Seth Vidal
+#
+# kitchen is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# kitchen is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with kitchen; if not, see
+#
+# Authors:
+#   James Antill
+#   Toshio Kuratomi
+#   Seth Vidal
+#
+# Portions of this code taken from yum/misc.py and yum/i18n.py
+'''
+---------------------------------------------
+Miscellaneous functions for manipulating text
+---------------------------------------------
+
+Collection of text functions that don't fit in another category.
+'''
+import htmlentitydefs
+import itertools
+import re
+
+try:
+    import chardet
+except ImportError:
+    chardet = None
+
+# We need to access b_() for localizing our strings but we'll end up with
+# a circular import if we import it directly.
+import kitchen as k
+from kitchen.pycompat24 import sets
+from kitchen.text.exceptions import ControlCharError
+
+sets.add_builtin_set()
+
+# Define a threshold for chardet confidence.  If we fall below this we decode
+# byte strings we're guessing about as latin-1
+_CHARDET_THRESHOLD = 0.6
+
+# ASCII control codes that are illegal in xml 1.0
+_CONTROL_CODES = frozenset(range(0, 8) + [11, 12] + range(14, 32))
+_CONTROL_CHARS = frozenset(itertools.imap(unichr, _CONTROL_CODES))
+
+# Regular expression matching html tags and entities
+_ENTITY_RE = re.compile(r'(?s)<[^>]*>|&#?\w+;')
+
+def guess_encoding(byte_string, disable_chardet=False):
+    '''Try to guess the encoding of a byte :class:`str`
+
+    :arg byte_string: byte :class:`str` to guess the encoding of
+    :kwarg disable_chardet: If this is :data:`True`, we never attempt to use
+        :mod:`chardet` to guess the encoding.  This is useful if you need to
+        have reproducibility whether :mod:`chardet` is installed or not.
+        Default: :data:`False`.
+    :raises TypeError: if :attr:`byte_string` is not a byte :class:`str` type
+    :returns: string containing a guess at the encoding of
+        :attr:`byte_string`.  This is appropriate to pass as the encoding
+        argument when encoding and decoding unicode strings.
+
+    We start by attempting to decode the byte :class:`str` as :term:`UTF-8`.
+    If this succeeds we tell the world it's :term:`UTF-8` text.  If it
+    doesn't and :mod:`chardet` is installed on the system and
+    :attr:`disable_chardet` is :data:`False` this function will use it to try
+    detecting the encoding of :attr:`byte_string`.  If it is not installed or
+    :mod:`chardet` cannot determine the encoding with a high enough
+    confidence then we rather arbitrarily claim that it is ``latin-1``.
+    Since every byte is a valid character in ``latin-1``, decoding from
+    ``latin-1`` to :class:`unicode` will never raise :exc:`UnicodeError`,
+    although the output might be mangled.
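+
+    An illustrative session (the return values follow from the algorithm
+    described above; the byte values were chosen to exercise each branch)::
+
+        >>> guess_encoding('simple ascii or utf8 text')
+        'utf-8'
+        >>> guess_encoding('caf\xe9', disable_chardet=True)  # not valid utf8
+        'latin-1'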
+    '''
+    if not isinstance(byte_string, str):
+        raise TypeError(k.b_('byte_string must be a byte string (str)'))
+    input_encoding = 'utf-8'
+    try:
+        unicode(byte_string, input_encoding, 'strict')
+    except UnicodeDecodeError:
+        input_encoding = None
+
+    if not input_encoding and chardet and not disable_chardet:
+        detection_info = chardet.detect(byte_string)
+        if detection_info['confidence'] >= _CHARDET_THRESHOLD:
+            input_encoding = detection_info['encoding']
+
+    if not input_encoding:
+        input_encoding = 'latin-1'
+
+    return input_encoding
+
+def str_eq(str1, str2, encoding='utf-8', errors='replace'):
+    '''Compare two strings, converting to byte :class:`str` if one is
+    :class:`unicode`
+
+    :arg str1: First string to compare
+    :arg str2: Second string to compare
+    :kwarg encoding: If we need to convert one string into a byte
+        :class:`str` to compare, the encoding to use.  Default is
+        :term:`utf-8`.
+    :kwarg errors: What to do if we encounter errors when encoding the
+        string.  See the :func:`kitchen.text.converters.to_bytes`
+        documentation for possible values.  The default is ``replace``.
+
+    This function prevents :exc:`UnicodeError` (python-2.4 or less) and
+    :exc:`UnicodeWarning` (python 2.5 and higher) when we compare
+    a :class:`unicode` string to a byte :class:`str`.  The errors normally
+    arise because the implicit conversion uses :term:`ASCII`.  This function
+    lets you convert to :term:`utf-8` or another encoding instead.
+
+    .. note::
+
+        When we need to convert one of the strings from :class:`unicode` in
+        order to compare them we convert the :class:`unicode` string into
+        a byte :class:`str`.  That means that strings can compare differently
+        if you use different encodings for each.
+
+    Note that ``str1 == str2`` is faster than this function if you can accept
+    the following limitations:
+
+    * Limited to python-2.5+ (otherwise a :exc:`UnicodeDecodeError` may be
+      thrown)
+    * Will generate a :exc:`UnicodeWarning` if a non-:term:`ASCII` byte
+      :class:`str` is compared to a :class:`unicode` string.
+    '''
+    try:
+        return (not str1 < str2) and (not str1 > str2)
+    except UnicodeError:
+        pass
+
+    if isinstance(str1, unicode):
+        str1 = str1.encode(encoding, errors)
+    else:
+        str2 = str2.encode(encoding, errors)
+    if str1 == str2:
+        return True
+
+    return False
+
+def process_control_chars(string, strategy='replace'):
+    '''Look for and transform :term:`control characters` in a string
+
+    :arg string: string to search within for :term:`control characters` to
+        transform
+    :kwarg strategy: XML does not allow :term:`ASCII` :term:`control
+        characters`.  When we encounter those we need to know what to do.
+        Valid options are:
+
+        :replace: (default) Replace the :term:`control characters`
+            with ``"?"``
+        :ignore: Remove the characters altogether from the output
+        :strict: Raise a :exc:`~kitchen.text.exceptions.ControlCharError`
+            when we encounter a control character
+    :raises TypeError: if :attr:`string` is not a unicode string.
+    :raises ValueError: if the strategy is not one of replace, ignore, or
+        strict.
+    :raises kitchen.text.exceptions.ControlCharError: if the strategy is
+        ``strict`` and a :term:`control character` is present in the
+        :attr:`string`
+    :returns: :class:`unicode` string with no :term:`control characters` in
+        it.
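+
+    For example (illustrative; the same input under each strategy)::
+
+        >>> process_control_chars(u'a\x02b')
+        u'a?b'
+        >>> process_control_chars(u'a\x02b', strategy='ignore')
+        u'ab'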
+ ''' + if not isinstance(string, unicode): + raise TypeError(k.b_('process_control_char must have a unicode type as' + ' the first argument.')) + if strategy == 'ignore': + control_table = dict(zip(_CONTROL_CODES, [None] * len(_CONTROL_CODES))) + elif strategy == 'replace': + control_table = dict(zip(_CONTROL_CODES, [u'?'] * len(_CONTROL_CODES))) + elif strategy == 'strict': + control_table = None + # Test that there are no control codes present + data = frozenset(string) + if [c for c in _CONTROL_CHARS if c in data]: + raise ControlCharError(k.b_('ASCII control code present in string' + ' input')) + else: + raise ValueError(k.b_('The strategy argument to process_control_chars' + ' must be one of ignore, replace, or strict')) + + if control_table: + string = string.translate(control_table) + + return string + +# Originally written by Fredrik Lundh (January 15, 2003) and placed in the +# public domain:: +# +# Unless otherwise noted, source code can be be used freely. Examples, test +# scripts and other short code fragments can be considered as being in the +# public domain. +# +# http://effbot.org/zone/re-sub.htm#unescape-html +# http://effbot.org/zone/copyright.htm +# +def html_entities_unescape(string): + '''Substitute unicode characters for HTML entities + + :arg string: :class:`unicode` string to substitute out html entities + :raises TypeError: if something other than a :class:`unicode` string is + given + :rtype: :class:`unicode` string + :returns: The plain text without html entities + ''' + def fixup(match): + string = match.group(0) + if string[:1] == u"<": + return "" # ignore tags + if string[:2] == u"&#": + try: + if string[:3] == u"&#x": + return unichr(int(string[3:-1], 16)) + else: + return unichr(int(string[2:-1])) + except ValueError: + # If the value is outside the unicode codepoint range, leave + # it in the output as is + pass + elif string[:1] == u"&": + entity = htmlentitydefs.entitydefs.get(string[1:-1].encode('utf-8')) + if entity: + if entity[:2] == "&#": + try: + return unichr(int(entity[2:-1])) + except ValueError: + # If the value is outside the unicode codepoint range, + # leave it in the output as is + pass + else: + return unicode(entity, "iso-8859-1") + return string # leave as is + + if not isinstance(string, unicode): + raise TypeError(k.b_('html_entities_unescape must have a unicode type' + ' for its first argument')) + return re.sub(_ENTITY_RE, fixup, string) + +def byte_string_valid_xml(byte_string, encoding='utf-8'): + '''Check that a byte :class:`str` would be valid in xml + + :arg byte_string: Byte :class:`str` to check + :arg encoding: Encoding of the xml file. Default: :term:`UTF-8` + :returns: :data:`True` if the string is valid. :data:`False` if it would + be invalid in the xml file + + In some cases you'll have a whole bunch of byte strings and rather than + transforming them to :class:`unicode` and back to byte :class:`str` for + output to xml, you will just want to make sure they work with the xml file + you're constructing. This function will help you do that. Example:: + + ARRAY_OF_MOSTLY_UTF8_STRINGS = [...] 
+        processed_array = []
+        for string in ARRAY_OF_MOSTLY_UTF8_STRINGS:
+            if byte_string_valid_xml(string, 'utf-8'):
+                processed_array.append(string)
+            else:
+                processed_array.append(guess_bytes_to_xml(string,
+                    encoding='utf-8'))
+        output_xml(processed_array)
+    '''
+    if not isinstance(byte_string, str):
+        # Not a byte string
+        return False
+
+    try:
+        u_string = unicode(byte_string, encoding)
+    except UnicodeError:
+        # Not encoded with the xml file's encoding
+        return False
+
+    data = frozenset(u_string)
+    if data.intersection(_CONTROL_CHARS):
+        # Contains control codes
+        return False
+
+    # The byte string is compatible with this xml file
+    return True
+
+def byte_string_valid_encoding(byte_string, encoding='utf-8'):
+    '''Detect if a byte :class:`str` is valid in a specific encoding
+
+    :arg byte_string: Byte :class:`str` to test for bytes not valid in this
+        encoding
+    :kwarg encoding: encoding to test against.  Defaults to :term:`UTF-8`.
+    :returns: :data:`True` if there are no byte sequences that are invalid in
+        the given encoding.  :data:`False` if an invalid sequence is
+        detected.
+
+    .. note::
+
+        This function checks whether the byte :class:`str` is valid in the
+        specified encoding.  It **does not** detect whether the byte
+        :class:`str` actually was encoded in that encoding.  If you want that
+        sort of functionality, you probably want to use
+        :func:`~kitchen.text.misc.guess_encoding` instead.
+    '''
+    try:
+        unicode(byte_string, encoding)
+    except UnicodeError:
+        # Contains byte sequences that are invalid in this encoding
+        return False
+
+    # byte string is valid in this encoding
+    return True
+
+__all__ = ('byte_string_valid_encoding', 'byte_string_valid_xml',
+    'guess_encoding', 'html_entities_unescape', 'process_control_chars',
+    'str_eq')
diff --git a/kitchen/text/utf8.py b/kitchen/text/utf8.py
new file mode 100644
index 0000000..53d37b1
--- /dev/null
+++ b/kitchen/text/utf8.py
@@ -0,0 +1,170 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2011 Red Hat, Inc.
+# Copyright (c) 2010 Ville Skyttä
+# Copyright (c) 2009 Tim Lauridsen
+# Copyright (c) 2007 Markus Kuhn
+#
+# kitchen is free software; you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# kitchen is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for
+# more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with kitchen; if not, see
+#
+# Authors:
+#   James Antill
+#   Markus Kuhn
+#   Toshio Kuratomi
+#   Tim Lauridsen
+#   Ville Skyttä
+#
+# Portions of this are from yum/i18n.py
+'''
+-----
+UTF-8
+-----
+
+Functions for operating on byte :class:`str` encoded as :term:`UTF-8`
+
+.. note::
+
+    In many cases, it is better to convert to :class:`unicode`, operate on
+    the strings, then convert back to :term:`UTF-8`.  The :class:`unicode`
+    type can handle many of these functions itself.  For those that it
+    doesn't (removing control characters from length calculations, for
+    instance) the code to do so with a :class:`unicode` type is often
+    simpler.
+
+.. warning::
+
+    All of the functions in this module are deprecated.  Most of them have
+    been replaced with functions that operate on unicode values in
+    :mod:`kitchen.text.display`.
:func:`kitchen.text.utf8.utf8_valid` has + been replaced with a function in :mod:`kitchen.text.misc`. +''' +import warnings + +from kitchen import b_ +from kitchen.text.converters import to_unicode, to_bytes +from kitchen.text.misc import byte_string_valid_encoding +from kitchen.text.display import _textual_width_le, \ + byte_string_textual_width_fill, fill, textual_width, \ + textual_width_chop, wrap + +# +# Deprecated functions +# + +def utf8_valid(msg): + '''**Deprecated** Detect if a string is valid :term:`utf-8` + + Use :func:`kitchen.text.misc.byte_string_valid_encoding` instead. + ''' + warnings.warn(b_('kitchen.text.utf8.utf8_valid is deprecated. Use' + ' kitchen.text.misc.byte_string_valid_encoding(msg) instead'), + DeprecationWarning, stacklevel=2) + return byte_string_valid_encoding(msg) + +def utf8_width(msg): + '''**Deprecated** Get the :term:`textual width` of a :term:`utf-8` string + + Use :func:`kitchen.text.display.textual_width` instead. + ''' + warnings.warn(b_('kitchen.text.utf8.utf8_width is deprecated. Use' + ' kitchen.text.display.textual_width(msg) instead'), + DeprecationWarning, stacklevel=2) + return textual_width(msg) + + +def utf8_width_chop(msg, chop=None): + '''**Deprecated** Return a string chopped to a given :term:`textual width` + + Use :func:`~kitchen.text.display.textual_width_chop` and + :func:`~kitchen.text.display.textual_width` instead:: + + >>> msg = 'く ku ら ra と to み mi' + >>> # Old way: + >>> utf8_width_chop(msg, 5) + (5, 'く ku') + >>> # New way + >>> from kitchen.text.converters import to_bytes + >>> from kitchen.text.display import textual_width, textual_width_chop + >>> (textual_width(msg), to_bytes(textual_width_chop(msg, 5))) + (5, 'く ku') + ''' + warnings.warn(b_('kitchen.text.utf8.utf8_width_chop is deprecated. Use' + ' kitchen.text.display.textual_width_chop instead'), DeprecationWarning, + stacklevel=2) + + if chop == None: + return textual_width(msg), msg + + as_bytes = not isinstance(msg, unicode) + + chopped_msg = textual_width_chop(msg, chop) + if as_bytes: + chopped_msg = to_bytes(chopped_msg) + return textual_width(chopped_msg), chopped_msg + +def utf8_width_fill(msg, fill, chop=None, left=True, prefix='', suffix=''): + '''**Deprecated** Pad a :term:`utf-8` string to fill a specified width + + Use :func:`~kitchen.text.display.byte_string_textual_width_fill` instead + ''' + warnings.warn(b_('kitchen.text.utf8.utf8_width_fill is deprecated. Use' + ' kitchen.text.display.byte_string_textual_width_fill instead'), + DeprecationWarning, stacklevel=2) + + return byte_string_textual_width_fill(msg, fill, chop=chop, left=left, + prefix=prefix, suffix=suffix) + +def utf8_text_wrap(text, width=70, initial_indent='', subsequent_indent=''): + '''**Deprecated** Similar to :func:`textwrap.wrap` but understands + :term:`utf-8` data and doesn't screw up lists/blocks/etc + + Use :func:`kitchen.text.display.wrap` instead + ''' + warnings.warn(b_('kitchen.text.utf8.utf8_text_wrap is deprecated. Use' + ' kitchen.text.display.wrap instead'), + DeprecationWarning, stacklevel=2) + + as_bytes = not isinstance(text, unicode) + + text = to_unicode(text) + lines = wrap(text, width=width, initial_indent=initial_indent, + subsequent_indent=subsequent_indent) + if as_bytes: + lines = [to_bytes(m) for m in lines] + + return lines + +def utf8_text_fill(text, *args, **kwargs): + '''**Deprecated** Similar to :func:`textwrap.fill` but understands + :term:`utf-8` strings and doesn't screw up lists/blocks/etc. + + Use :func:`kitchen.text.display.fill` instead. 
+ ''' + warnings.warn(b_('kitchen.text.utf8.utf8_text_fill is deprecated. Use' + ' kitchen.text.display.fill instead'), + DeprecationWarning, stacklevel=2) + # This assumes that all args. are utf8. + return fill(text, *args, **kwargs) + +def _utf8_width_le(width, *args): + '''**Deprecated** Convert the arguments to unicode and use + :func:`kitchen.text.display._textual_width_le` instead. + ''' + warnings.warn(b_('kitchen.text.utf8._utf8_width_le is deprecated. Use' + ' kitchen.text.display._textual_width_le instead'), + DeprecationWarning, stacklevel=2) + # This assumes that all args. are utf8. + return _textual_width_le(width, to_unicode(''.join(args))) + +__all__ = ('utf8_text_fill', 'utf8_text_wrap', 'utf8_valid', 'utf8_width', + 'utf8_width_chop', 'utf8_width_fill') diff --git a/kitchen/versioning/__init__.py b/kitchen/versioning/__init__.py new file mode 100644 index 0000000..69bc21c --- /dev/null +++ b/kitchen/versioning/__init__.py @@ -0,0 +1,107 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2011 Red Hat, Inc +# +# kitchen is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# kitchen is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with kitchen; if not, see +# +# Authors: +# Toshio Kuratomi +''' +---------------------------- +PEP-386 compliant versioning +---------------------------- + +:pep:`386` defines a standard format for version strings. This module +contains a function for creating strings in that format. +''' +__version_info__ = ((1, 0, 0),) + +import itertools + +def version_tuple_to_string(version_info): + '''Return a :pep:`386` version string from a :pep:`386` style version tuple + + :arg version_info: Nested set of tuples that describes the version. See + below for an example. + :returns: a version string + + This function implements just enough of :pep:`386` to satisfy our needs. + :pep:`386` defines a standard format for version strings and refers to + a function that will be merged into the |stdlib|_ that transforms a tuple + of version information into a standard version string. This function is + an implementation of that function. Once that function becomes available + in the |stdlib|_ we will start using it and deprecate this function. + + :attr:`version_info` takes the form that :pep:`386`'s + :func:`NormalizedVersion.from_parts` uses:: + + ((Major, Minor, [Micros]), [(Alpha/Beta/rc marker, version)], + [(post/dev marker, version)]) + + Ex: ((1, 0, 0), ('a', 2), ('dev', 3456)) + + It generates a :pep:`386` compliant version string:: + + N.N[.N]+[{a|b|c|rc}N[.N]+][.postN][.devN] + + Ex: 1.0.0a2.dev3456 + + .. warning:: This function does next to no error checking. It's up to the + person defining the version tuple to make sure that the values make + sense. If the :pep:`386` compliant version parser doesn't get + released soon we'll look at making this function check that the + version tuple makes sense before transforming it into a string. + + It's recommended that you use this function to keep + a :data:`__version_info__` tuple and :data:`__version__` string in your + modules. 
Why do we need both a tuple and a string? The string is often + useful for putting into human readable locations like release + announcements, version strings in tarballs, etc. Meanwhile the tuple is + very easy for a computer to compare. For example, kitchen sets up its + version information like this:: + + from kitchen.versioning import version_tuple_to_string + __version_info__ = ((0, 2, 1),) + __version__ = version_tuple_to_string(__version_info__) + + Other programs that depend on a kitchen version between 0.2.1 and 0.3.0 + can find whether the present version is okay with code like this:: + + from kitchen import __version_info__, __version__ + if __version_info__ < ((0, 2, 1),) or __version_info__ >= ((0, 3, 0),): + print 'kitchen is present but not at the right version.' + print 'We need at least version 0.2.1 and less than 0.3.0' + print 'Currently found: kitchen-%s' % __version__ + ''' + ver_components = [] + for values in version_info: + if isinstance(values[0], int): + ver_components.append('.'.join(itertools.imap(str, values))) + else: + if isinstance(values[0], unicode): + modifier = values[0].encode('ascii') + else: + modifier = values[0] + if modifier in ('a', 'b', 'c', 'rc'): + ver_components.append('%s%s' % (modifier, + '.'.join(itertools.imap(str, values[1:])) or '0')) + else: + ver_components.append('.%s%s' % (modifier, + str(values[1]))) + return unicode(''.join(ver_components), 'ascii') + + +__version__ = version_tuple_to_string(__version_info__) + +__all__ = ('version_tuple_to_string',) diff --git a/locale/de/LC_MESSAGES/kitchen.mo b/locale/de/LC_MESSAGES/kitchen.mo new file mode 100644 index 0000000..5bcf178 Binary files /dev/null and b/locale/de/LC_MESSAGES/kitchen.mo differ diff --git a/locale/en_US/LC_MESSAGES/kitchen.mo b/locale/en_US/LC_MESSAGES/kitchen.mo new file mode 100644 index 0000000..885cda4 Binary files /dev/null and b/locale/en_US/LC_MESSAGES/kitchen.mo differ diff --git a/po/de.po b/po/de.po new file mode 100644 index 0000000..2d6fa87 --- /dev/null +++ b/po/de.po @@ -0,0 +1,184 @@ +# Translations template for PROJECT. +# Copyright (C) 2012 ORGANIZATION +# This file is distributed under the same license as the PROJECT project. +# +# Translators: +# Christoph Scheid , 2012. +msgid "" +msgstr "" +"Project-Id-Version: Kitchen: Miscellaneous, useful python code\n" +"Report-Msgid-Bugs-To: https://fedorahosted.org/kitchen/\n" +"POT-Creation-Date: 2012-01-03 18:23-0800\n" +"PO-Revision-Date: 2012-01-13 20:39+0000\n" +"Last-Translator: Christoph Scheid \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 0.9.6\n" +"Language: de\n" +"Plural-Forms: nplurals=2; plural=(n != 1)\n" + +#: kitchen/release.py:9 +msgid "Kitchen contains a cornucopia of useful code" +msgstr "Kitchen ist ein Füllhorn voller nützlichem Code." + +#: kitchen/release.py:10 +msgid "" +"\n" +"We've all done it. In the process of writing a brand new application we've\n" +"discovered that we need a little bit of code that we've invented before.\n" +"Perhaps it's something to handle unicode text. Perhaps it's something to make\n" +"a bit of python-2.5 code run on python-2.3. Whatever it is, it ends up being\n" +"a tiny bit of code that seems too small to worry about pushing into its own\n" +"module so it sits there, a part of your current project, waiting to be cut and\n" +"pasted into your next project. And the next. And the next. 
And since that\n" +"little bittybit of code proved so useful to you, it's highly likely that it\n" +"proved useful to someone else as well. Useful enough that they've written it\n" +"and copy and pasted it over and over into each of their new projects.\n" +"\n" +"Well, no longer! Kitchen aims to pull these small snippets of code into a few\n" +"python modules which you can import and use within your project. No more copy\n" +"and paste! Now you can let someone else maintain and release these small\n" +"snippets so that you can get on with your life.\n" +msgstr "" + +#: kitchen/pycompat25/collections/_defaultdict.py:93 +msgid "First argument must be callable" +msgstr "Das erste Argument muss ausführbar (callable) sein." + +#: kitchen/text/converters.py:140 +msgid "" +"non_string is a deprecated parameter of to_unicode(). Use nonstring instead" +msgstr "non_string ist ein veralteter Parameter von to_unicode(). Stattdessen nonstring verwenden." + +#: kitchen/text/converters.py:174 +#, python-format +msgid "" +"to_unicode was given \"%(obj)s\" which is neither a byte string (str) or a " +"unicode string" +msgstr "" + +#: kitchen/text/converters.py:178 kitchen/text/converters.py:297 +#, python-format +msgid "nonstring value, %(param)s, is not set to a valid action" +msgstr "" + +#: kitchen/text/converters.py:255 +msgid "" +"non_string is a deprecated parameter of to_bytes(). Use nonstring instead" +msgstr "" + +#: kitchen/text/converters.py:294 +#, python-format +msgid "" +"to_bytes was given \"%(obj)s\" which is neither a unicode string or a byte " +"string (str)" +msgstr "" + +#: kitchen/text/converters.py:378 +msgid "" +"kitchen.text.converters.to_utf8 is deprecated. Use " +"kitchen.text.converters.to_bytes(obj, encoding=\"utf-8\", " +"nonstring=\"passthru\" instead." +msgstr "" + +#: kitchen/text/converters.py:403 +msgid "" +"to_str is deprecated. Use to_unicode or to_bytes instead. See the to_str " +"docstring for porting information." +msgstr "" + +#: kitchen/text/converters.py:685 +msgid "" +"unicode_to_xml must have a unicode type as the first argument. Use " +"bytes_string_to_xml for byte strings." +msgstr "" + +#: kitchen/text/converters.py:689 +msgid "" +"The control_chars argument to unicode_to_xml must be one of ignore, replace," +" or strict" +msgstr "" + +#: kitchen/text/converters.py:786 +msgid "" +"byte_string_to_xml can only take a byte string as its first argument. Use " +"unicode_to_xml for unicode strings" +msgstr "" + +#: kitchen/text/converters.py:910 +msgid "" +"kitchen.text.converters.to_xml is deprecated. Use " +"kitchen.text.converters.guess_encoding_to_xml instead." +msgstr "" + +#: kitchen/text/display.py:344 +msgid "" +"_ucp_width does not understand how to assign a width value to control " +"characters." +msgstr "" + +#: kitchen/text/misc.py:83 +msgid "byte_string must be a byte string (str)" +msgstr "" + +#: kitchen/text/misc.py:171 +msgid "process_control_char must have a unicode type as the first argument." +msgstr "" + +#: kitchen/text/misc.py:182 +msgid "ASCII control code present in string input" +msgstr "" + +#: kitchen/text/misc.py:185 +msgid "" +"The strategy argument to process_control_chars must be one of ignore, " +"replace, or strict" +msgstr "" + +#: kitchen/text/misc.py:241 +msgid "html_entities_unescape must have a unicode type for its first argument" +msgstr "" + +#: kitchen/text/utf8.py:69 +msgid "" +"kitchen.text.utf8.utf8_valid is deprecated. 
Use " +"kitchen.text.misc.byte_string_valid_encoding(msg) instead" +msgstr "" + +#: kitchen/text/utf8.py:79 +msgid "" +"kitchen.text.utf8.utf8_width is deprecated. Use " +"kitchen.text.display.textual_width(msg) instead" +msgstr "" + +#: kitchen/text/utf8.py:101 +msgid "" +"kitchen.text.utf8.utf8_width_chop is deprecated. Use " +"kitchen.text.display.textual_width_chop instead" +msgstr "" + +#: kitchen/text/utf8.py:120 +msgid "" +"kitchen.text.utf8.utf8_width_fill is deprecated. Use " +"kitchen.text.display.byte_string_textual_width_fill instead" +msgstr "" + +#: kitchen/text/utf8.py:133 +msgid "" +"kitchen.text.utf8.utf8_text_wrap is deprecated. Use " +"kitchen.text.display.wrap instead" +msgstr "" + +#: kitchen/text/utf8.py:153 +msgid "" +"kitchen.text.utf8.utf8_text_fill is deprecated. Use " +"kitchen.text.display.fill instead" +msgstr "" + +#: kitchen/text/utf8.py:163 +msgid "" +"kitchen.text.utf8._utf8_width_le is deprecated. Use " +"kitchen.text.display._textual_width_le instead" +msgstr "" diff --git a/po/en_US.po b/po/en_US.po new file mode 100644 index 0000000..d1e1cd6 --- /dev/null +++ b/po/en_US.po @@ -0,0 +1,239 @@ +# Translations template for PROJECT. +# Copyright (C) 2012 ORGANIZATION +# This file is distributed under the same license as the PROJECT project. +# +# Translators: +msgid "" +msgstr "" +"Project-Id-Version: Kitchen: Miscellaneous, useful python code\n" +"Report-Msgid-Bugs-To: https://fedorahosted.org/kitchen/\n" +"POT-Creation-Date: 2012-01-03 18:23-0800\n" +"PO-Revision-Date: 2012-01-03 07:48+0000\n" +"Last-Translator: Toshio Kuratomi \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 0.9.6\n" +"Language: en_US\n" +"Plural-Forms: nplurals=2; plural=(n != 1)\n" + +#: kitchen/release.py:9 +msgid "Kitchen contains a cornucopia of useful code" +msgstr "Kitchen contains a cornucopia of useful code" + +#: kitchen/release.py:10 +msgid "" +"\n" +"We've all done it. In the process of writing a brand new application we've\n" +"discovered that we need a little bit of code that we've invented before.\n" +"Perhaps it's something to handle unicode text. Perhaps it's something to make\n" +"a bit of python-2.5 code run on python-2.3. Whatever it is, it ends up being\n" +"a tiny bit of code that seems too small to worry about pushing into its own\n" +"module so it sits there, a part of your current project, waiting to be cut and\n" +"pasted into your next project. And the next. And the next. And since that\n" +"little bittybit of code proved so useful to you, it's highly likely that it\n" +"proved useful to someone else as well. Useful enough that they've written it\n" +"and copy and pasted it over and over into each of their new projects.\n" +"\n" +"Well, no longer! Kitchen aims to pull these small snippets of code into a few\n" +"python modules which you can import and use within your project. No more copy\n" +"and paste! Now you can let someone else maintain and release these small\n" +"snippets so that you can get on with your life.\n" +msgstr "" +"\n" +"We've all done it. In the process of writing a brand new application we've\n" +"discovered that we need a little bit of code that we've invented before.\n" +"Perhaps it's something to handle unicode text. Perhaps it's something to make\n" +"a bit of python-2.5 code run on python-2.3. 
Whatever it is, it ends up being\n" +"a tiny bit of code that seems too small to worry about pushing into its own\n" +"module so it sits there, a part of your current project, waiting to be cut and\n" +"pasted into your next project. And the next. And the next. And since that\n" +"little bittybit of code proved so useful to you, it's highly likely that it\n" +"proved useful to someone else as well. Useful enough that they've written it\n" +"and copy and pasted it over and over into each of their new projects.\n" +"\n" +"Well, no longer! Kitchen aims to pull these small snippets of code into a few\n" +"python modules which you can import and use within your project. No more copy\n" +"and paste! Now you can let someone else maintain and release these small\n" +"snippets so that you can get on with your life.\n" + +#: kitchen/pycompat25/collections/_defaultdict.py:93 +msgid "First argument must be callable" +msgstr "First argument must be callable" + +#: kitchen/text/converters.py:140 +msgid "" +"non_string is a deprecated parameter of to_unicode(). Use nonstring instead" +msgstr "" +"non_string is a deprecated parameter of to_unicode(). Use nonstring instead" + +#: kitchen/text/converters.py:174 +#, python-format +msgid "" +"to_unicode was given \"%(obj)s\" which is neither a byte string (str) or a " +"unicode string" +msgstr "" +"to_unicode was given \"%(obj)s\" which is neither a byte string (str) or a " +"unicode string" + +#: kitchen/text/converters.py:178 kitchen/text/converters.py:297 +#, python-format +msgid "nonstring value, %(param)s, is not set to a valid action" +msgstr "nonstring value, %(param)s, is not set to a valid action" + +#: kitchen/text/converters.py:255 +msgid "" +"non_string is a deprecated parameter of to_bytes(). Use nonstring instead" +msgstr "" +"non_string is a deprecated parameter of to_bytes(). Use nonstring instead" + +#: kitchen/text/converters.py:294 +#, python-format +msgid "" +"to_bytes was given \"%(obj)s\" which is neither a unicode string or a byte " +"string (str)" +msgstr "" +"to_bytes was given \"%(obj)s\" which is neither a unicode string or a byte " +"string (str)" + +#: kitchen/text/converters.py:378 +msgid "" +"kitchen.text.converters.to_utf8 is deprecated. Use " +"kitchen.text.converters.to_bytes(obj, encoding=\"utf-8\", " +"nonstring=\"passthru\" instead." +msgstr "" +"kitchen.text.converters.to_utf8 is deprecated. Use " +"kitchen.text.converters.to_bytes(obj, encoding=\"utf-8\", " +"nonstring=\"passthru\" instead." + +#: kitchen/text/converters.py:403 +msgid "" +"to_str is deprecated. Use to_unicode or to_bytes instead. See the to_str " +"docstring for porting information." +msgstr "" +"to_str is deprecated. Use to_unicode or to_bytes instead. See the to_str " +"docstring for porting information." + +#: kitchen/text/converters.py:685 +msgid "" +"unicode_to_xml must have a unicode type as the first argument. Use " +"bytes_string_to_xml for byte strings." +msgstr "" +"unicode_to_xml must have a unicode type as the first argument. Use " +"bytes_string_to_xml for byte strings." + +#: kitchen/text/converters.py:689 +msgid "" +"The control_chars argument to unicode_to_xml must be one of ignore, replace," +" or strict" +msgstr "" +"The control_chars argument to unicode_to_xml must be one of ignore, replace," +" or strict" + +#: kitchen/text/converters.py:786 +msgid "" +"byte_string_to_xml can only take a byte string as its first argument. 
Use " +"unicode_to_xml for unicode strings" +msgstr "" +"byte_string_to_xml can only take a byte string as its first argument. Use " +"unicode_to_xml for unicode strings" + +#: kitchen/text/converters.py:910 +msgid "" +"kitchen.text.converters.to_xml is deprecated. Use " +"kitchen.text.converters.guess_encoding_to_xml instead." +msgstr "" +"kitchen.text.converters.to_xml is deprecated. Use " +"kitchen.text.converters.guess_encoding_to_xml instead." + +#: kitchen/text/display.py:344 +msgid "" +"_ucp_width does not understand how to assign a width value to control " +"characters." +msgstr "" +"_ucp_width does not understand how to assign a width value to control " +"characters." + +#: kitchen/text/misc.py:83 +msgid "byte_string must be a byte string (str)" +msgstr "byte_string must be a byte string (str)" + +#: kitchen/text/misc.py:171 +msgid "process_control_char must have a unicode type as the first argument." +msgstr "process_control_char must have a unicode type as the first argument." + +#: kitchen/text/misc.py:182 +msgid "ASCII control code present in string input" +msgstr "ASCII control code present in string input" + +#: kitchen/text/misc.py:185 +msgid "" +"The strategy argument to process_control_chars must be one of ignore, " +"replace, or strict" +msgstr "" +"The strategy argument to process_control_chars must be one of ignore, " +"replace, or strict" + +#: kitchen/text/misc.py:241 +msgid "html_entities_unescape must have a unicode type for its first argument" +msgstr "" +"html_entities_unescape must have a unicode type for its first argument" + +#: kitchen/text/utf8.py:69 +msgid "" +"kitchen.text.utf8.utf8_valid is deprecated. Use " +"kitchen.text.misc.byte_string_valid_encoding(msg) instead" +msgstr "" +"kitchen.text.utf8.utf8_valid is deprecated. Use " +"kitchen.text.misc.byte_string_valid_encoding(msg) instead" + +#: kitchen/text/utf8.py:79 +msgid "" +"kitchen.text.utf8.utf8_width is deprecated. Use " +"kitchen.text.display.textual_width(msg) instead" +msgstr "" +"kitchen.text.utf8.utf8_width is deprecated. Use " +"kitchen.text.display.textual_width(msg) instead" + +#: kitchen/text/utf8.py:101 +msgid "" +"kitchen.text.utf8.utf8_width_chop is deprecated. Use " +"kitchen.text.display.textual_width_chop instead" +msgstr "" +"kitchen.text.utf8.utf8_width_chop is deprecated. Use " +"kitchen.text.display.textual_width_chop instead" + +#: kitchen/text/utf8.py:120 +msgid "" +"kitchen.text.utf8.utf8_width_fill is deprecated. Use " +"kitchen.text.display.byte_string_textual_width_fill instead" +msgstr "" +"kitchen.text.utf8.utf8_width_fill is deprecated. Use " +"kitchen.text.display.byte_string_textual_width_fill instead" + +#: kitchen/text/utf8.py:133 +msgid "" +"kitchen.text.utf8.utf8_text_wrap is deprecated. Use " +"kitchen.text.display.wrap instead" +msgstr "" +"kitchen.text.utf8.utf8_text_wrap is deprecated. Use " +"kitchen.text.display.wrap instead" + +#: kitchen/text/utf8.py:153 +msgid "" +"kitchen.text.utf8.utf8_text_fill is deprecated. Use " +"kitchen.text.display.fill instead" +msgstr "" +"kitchen.text.utf8.utf8_text_fill is deprecated. Use " +"kitchen.text.display.fill instead" + +#: kitchen/text/utf8.py:163 +msgid "" +"kitchen.text.utf8._utf8_width_le is deprecated. Use " +"kitchen.text.display._textual_width_le instead" +msgstr "" +"kitchen.text.utf8._utf8_width_le is deprecated. 
Use " +"kitchen.text.display._textual_width_le instead" + + diff --git a/po/kitchen.pot b/po/kitchen.pot new file mode 100644 index 0000000..dd0df3f --- /dev/null +++ b/po/kitchen.pot @@ -0,0 +1,194 @@ +# Translations template for PROJECT. +# Copyright (C) 2012 ORGANIZATION +# This file is distributed under the same license as the PROJECT project. +# FIRST AUTHOR , 2012. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: PROJECT VERSION\n" +"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n" +"POT-Creation-Date: 2012-01-03 18:23-0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 0.9.6\n" + +#: kitchen/release.py:9 +msgid "Kitchen contains a cornucopia of useful code" +msgstr "" + +#: kitchen/release.py:10 +msgid "" +"\n" +"We've all done it. In the process of writing a brand new application " +"we've\n" +"discovered that we need a little bit of code that we've invented before.\n" +"Perhaps it's something to handle unicode text. Perhaps it's something to" +" make\n" +"a bit of python-2.5 code run on python-2.3. Whatever it is, it ends up " +"being\n" +"a tiny bit of code that seems too small to worry about pushing into its " +"own\n" +"module so it sits there, a part of your current project, waiting to be " +"cut and\n" +"pasted into your next project. And the next. And the next. And since " +"that\n" +"little bittybit of code proved so useful to you, it's highly likely that " +"it\n" +"proved useful to someone else as well. Useful enough that they've " +"written it\n" +"and copy and pasted it over and over into each of their new projects.\n" +"\n" +"Well, no longer! Kitchen aims to pull these small snippets of code into " +"a few\n" +"python modules which you can import and use within your project. No more" +" copy\n" +"and paste! Now you can let someone else maintain and release these small" +"\n" +"snippets so that you can get on with your life.\n" +msgstr "" + +#: kitchen/pycompat25/collections/_defaultdict.py:93 +msgid "First argument must be callable" +msgstr "" + +#: kitchen/text/converters.py:140 +msgid "" +"non_string is a deprecated parameter of to_unicode(). Use nonstring " +"instead" +msgstr "" + +#: kitchen/text/converters.py:174 +#, python-format +msgid "" +"to_unicode was given \"%(obj)s\" which is neither a byte string (str) or " +"a unicode string" +msgstr "" + +#: kitchen/text/converters.py:178 kitchen/text/converters.py:297 +#, python-format +msgid "nonstring value, %(param)s, is not set to a valid action" +msgstr "" + +#: kitchen/text/converters.py:255 +msgid "non_string is a deprecated parameter of to_bytes(). Use nonstring instead" +msgstr "" + +#: kitchen/text/converters.py:294 +#, python-format +msgid "" +"to_bytes was given \"%(obj)s\" which is neither a unicode string or a " +"byte string (str)" +msgstr "" + +#: kitchen/text/converters.py:378 +msgid "" +"kitchen.text.converters.to_utf8 is deprecated. Use " +"kitchen.text.converters.to_bytes(obj, encoding=\"utf-8\", " +"nonstring=\"passthru\" instead." +msgstr "" + +#: kitchen/text/converters.py:403 +msgid "" +"to_str is deprecated. Use to_unicode or to_bytes instead. See the " +"to_str docstring for porting information." +msgstr "" + +#: kitchen/text/converters.py:685 +msgid "" +"unicode_to_xml must have a unicode type as the first argument. Use " +"bytes_string_to_xml for byte strings." 
+msgstr "" + +#: kitchen/text/converters.py:689 +msgid "" +"The control_chars argument to unicode_to_xml must be one of ignore, " +"replace, or strict" +msgstr "" + +#: kitchen/text/converters.py:786 +msgid "" +"byte_string_to_xml can only take a byte string as its first argument. " +"Use unicode_to_xml for unicode strings" +msgstr "" + +#: kitchen/text/converters.py:910 +msgid "" +"kitchen.text.converters.to_xml is deprecated. Use " +"kitchen.text.converters.guess_encoding_to_xml instead." +msgstr "" + +#: kitchen/text/display.py:344 +msgid "" +"_ucp_width does not understand how to assign a width value to control " +"characters." +msgstr "" + +#: kitchen/text/misc.py:83 +msgid "byte_string must be a byte string (str)" +msgstr "" + +#: kitchen/text/misc.py:171 +msgid "process_control_char must have a unicode type as the first argument." +msgstr "" + +#: kitchen/text/misc.py:182 +msgid "ASCII control code present in string input" +msgstr "" + +#: kitchen/text/misc.py:185 +msgid "" +"The strategy argument to process_control_chars must be one of ignore, " +"replace, or strict" +msgstr "" + +#: kitchen/text/misc.py:241 +msgid "html_entities_unescape must have a unicode type for its first argument" +msgstr "" + +#: kitchen/text/utf8.py:69 +msgid "" +"kitchen.text.utf8.utf8_valid is deprecated. Use " +"kitchen.text.misc.byte_string_valid_encoding(msg) instead" +msgstr "" + +#: kitchen/text/utf8.py:79 +msgid "" +"kitchen.text.utf8.utf8_width is deprecated. Use " +"kitchen.text.display.textual_width(msg) instead" +msgstr "" + +#: kitchen/text/utf8.py:101 +msgid "" +"kitchen.text.utf8.utf8_width_chop is deprecated. Use " +"kitchen.text.display.textual_width_chop instead" +msgstr "" + +#: kitchen/text/utf8.py:120 +msgid "" +"kitchen.text.utf8.utf8_width_fill is deprecated. Use " +"kitchen.text.display.byte_string_textual_width_fill instead" +msgstr "" + +#: kitchen/text/utf8.py:133 +msgid "" +"kitchen.text.utf8.utf8_text_wrap is deprecated. Use " +"kitchen.text.display.wrap instead" +msgstr "" + +#: kitchen/text/utf8.py:153 +msgid "" +"kitchen.text.utf8.utf8_text_fill is deprecated. Use " +"kitchen.text.display.fill instead" +msgstr "" + +#: kitchen/text/utf8.py:163 +msgid "" +"kitchen.text.utf8._utf8_width_le is deprecated. Use " +"kitchen.text.display._textual_width_le instead" +msgstr "" + diff --git a/releaseutils.py b/releaseutils.py new file mode 100755 index 0000000..d10d62e --- /dev/null +++ b/releaseutils.py @@ -0,0 +1,64 @@ +#!/usr/bin/python -tt + +import ConfigParser +import glob +import os +import shutil +from kitchen.pycompat27 import subprocess + +class MsgFmt(object): + def run(self, args): + cmd = subprocess.Popen(args, shell=False) + cmd.wait() + +def setup_message_compiler(): + # Look for msgfmt + try: + subprocess.Popen(['msgfmt', '-h'], stdout=subprocess.PIPE) + except OSError: + import babel.messages.frontend + + return (babel.messages.frontend.CommandLineInterface(), + 'pybabel compile -D %(domain)s -d locale -i %(pofile)s -l %(lang)s' + ) + else: + return (MsgFmt(), 'msgfmt -c -o locale/%(lang)s/LC_MESSAGES/%(domain)s.mo %(pofile)s') + +def main(): + # Get the directory with message catalogs + # Reuse transifex's config file first as it will know this + cfg = ConfigParser.SafeConfigParser() + cfg.read('.tx/config') + cmd, args = setup_message_compiler() + + try: + shutil.rmtree('locale') + except OSError, e: + # If the error is that locale does not exist, we're okay. 
We're
+        # deleting it here, after all
+        if e.errno != 2:
+            raise
+
+    for section in [s for s in cfg.sections() if s != 'main']:
+        try:
+            file_filter = cfg.get(section, 'file_filter')
+            source_file = cfg.get(section, 'source_file')
+        except ConfigParser.NoOptionError:
+            continue
+        glob_pattern = file_filter.replace('<lang>', '*')
+        pot = os.path.basename(source_file)
+        if pot.endswith('.pot'):
+            pot = pot[:-4]
+        arg_values = {'domain': pot}
+        for po_file in glob.glob(glob_pattern):
+            file_pattern = os.path.basename(po_file)
+            lang = file_pattern.replace('.po', '')
+            os.makedirs(os.path.join('locale', lang, 'LC_MESSAGES'))
+            arg_values['pofile'] = po_file
+            arg_values['lang'] = lang
+            compile_args = args % arg_values
+            compile_args = compile_args.split(' ')
+            cmd.run(compile_args)
+
+if __name__ == '__main__':
+    main()
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..b3655d1
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,2 @@
+[upload_docs]
+upload_dir=build/sphinx/html
diff --git a/setup.py b/setup.py
new file mode 100755
index 0000000..238af78
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,57 @@
+#!/usr/bin/python -tt
+# -*- coding: utf-8 -*-
+
+from distutils.command.sdist import sdist as _sdist
+import glob
+import os
+
+from setuptools import find_packages, setup
+import kitchen.release
+
+import releaseutils
+
+# Override sdist command to compile the message catalogs as well
+class Sdist(_sdist, object):
+    def run(self):
+        releaseutils.main()
+        data_files = []
+        for langfile in filter(os.path.isfile, glob.glob('locale/*/*/*.mo')):
+            data_files.append((os.path.dirname(langfile), [langfile]))
+        if self.distribution.data_files and \
+                hasattr(self.distribution.data_files, 'extend'):
+            self.distribution.data_files.extend(data_files)
+        else:
+            self.distribution.data_files = data_files
+        super(Sdist, self).run()
+
+
+setup(name='kitchen',
+      version=str(kitchen.release.__version__),
+      description=kitchen.release.DESCRIPTION,
+      long_description=kitchen.release.LONG_DESCRIPTION,
+      author=kitchen.release.AUTHOR,
+      author_email=kitchen.release.EMAIL,
+      maintainer='Toshio Kuratomi',
+      maintainer_email='toshio@fedoraproject.org',
+      license=kitchen.release.LICENSE,
+      url=kitchen.release.URL,
+      download_url=kitchen.release.DOWNLOAD_URL,
+      cmdclass={'sdist': Sdist},
+      keywords='Useful Small Code Snippets',
+      classifiers=[
+          'Development Status :: 4 - Beta',
+          'License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)',
+          'Operating System :: OS Independent',
+          'Programming Language :: Python :: 2.3',
+          'Programming Language :: Python :: 2.4',
+          'Programming Language :: Python :: 2.5',
+          'Programming Language :: Python :: 2.6',
+          'Programming Language :: Python :: 2.7',
+          'Topic :: Software Development :: Internationalization',
+          'Topic :: Software Development :: Libraries :: Python Modules',
+          'Topic :: Text Processing :: General',
+      ],
+      packages=find_packages(),
+      data_files=[],
+)
diff --git a/tests/base_classes.py b/tests/base_classes.py
new file mode 100644
index 0000000..32dc367
--- /dev/null
+++ b/tests/base_classes.py
@@ -0,0 +1,129 @@
+# -*- coding: utf-8 -*-
+#
+# Base class for testing unicode and utf8 functions. 
This holds data that's +# useful for making tests + +import re + +from kitchen.text.converters import to_bytes +from kitchen.text import misc + +class UnicodeTestData(object): + # This should encode fine -- sanity check + u_ascii = u'the quick brown fox jumped over the lazy dog' + b_ascii = 'the quick brown fox jumped over the lazy dog' + + # First challenge -- what happens with latin-1 characters + u_spanish = u'El veloz murciélago saltó sobre el perro perezoso.' + # utf8 and latin1 both support these chars so no mangling + utf8_spanish = u_spanish.encode('utf8') + latin1_spanish = u_spanish.encode('latin1') + + # ASCII does not have the accented characters so it mangles + ascii_mangled_spanish_as_ascii = u_spanish.encode('ascii', 'replace') + # Attempting to decode using the wrong charset will mangle + # Note: as a general principle, we do not want to have code that mangles + # input of one charset and output of the same charset. We want to avoid + # things like:: + # input latin-1, transform to unicode with utf-8, output latin-1. + u_mangled_spanish_utf8_as_latin1 = unicode(utf8_spanish, encoding='latin1', errors='replace') + u_mangled_spanish_utf8_as_ascii = unicode(utf8_spanish, encoding='ascii', errors='replace') + u_mangled_spanish_latin1_as_ascii = unicode(latin1_spanish, encoding='ascii', errors='replace') + u_mangled_spanish_latin1_as_utf8 = unicode(latin1_spanish, encoding='utf-8', errors='replace') + ascii_twice_mangled_spanish_latin1_as_utf8_as_ascii = u_mangled_spanish_latin1_as_utf8.encode('ascii', 'replace') + utf8_mangled_spanish_latin1_as_utf8 = u_mangled_spanish_latin1_as_utf8.encode('utf-8') + u_spanish_ignore = unicode(latin1_spanish, encoding='utf8', errors='ignore') + + u_japanese = u"速い茶色のキツネが怠惰な犬に'増" + utf8_japanese = u_japanese.encode('utf8') + euc_jp_japanese = u_japanese.encode('euc_jp') + u_mangled_euc_jp_as_latin1 = unicode(euc_jp_japanese, 'latin1') + u_mangled_euc_jp_as_utf8 = unicode(euc_jp_japanese, 'utf-8', 'replace') + utf8_mangled_euc_jp_as_latin1 = u_mangled_euc_jp_as_latin1.encode('utf8') + u_mangled_japanese_utf8_as_latin1 = unicode(utf8_japanese, 'latin1') + u_mangled_japanese_utf8_as_ascii = u"������������������������������������������'���" + ascii_mangled_japanese_replace_as_latin1 = "??????????????'?" + latin1_mangled_japanese_replace_as_latin1 = "??????????????'?" + + u_mixed = u'く ku ら ra と to み mi' + utf8_mixed = u_mixed.encode('utf8') + utf8_ku = u_mixed[0].encode('utf8') + utf8_ra = u_mixed[2].encode('utf8') + utf8_to = u_mixed[4].encode('utf8') + utf8_mi = u_mixed[6].encode('utf8') + + u_mixed_replace = u'\ufffd ku \ufffd ra \ufffd to \ufffd mi' + u_mixed_ignore = u' ku ra to mi' + latin1_mixed_replace = '? ku ? ra ? to ? 
mi' + latin1_mixed_ignore = ' ku ra to mi' + + u_entity = u'Test: <"&"> – ' + u_japanese + u'é' + utf8_entity = u_entity.encode('utf8') + u_entity_escape = u'Test: <"&"> – ' + unicode(u_japanese.encode('ascii', 'xmlcharrefreplace'), 'ascii') + u'é' + utf8_entity_escape = 'Test: <"&"> – 速い茶色のキツネが怠惰な犬に\'増é' + utf8_attrib_escape = 'Test: <"&"> – 速い茶色のキツネが怠惰な犬に\'増é' + ascii_entity_escape = (u'Test: <"&"> – ' + u_japanese + u'é').encode('ascii', 'xmlcharrefreplace').replace('&', '&',1).replace('<', '<').replace('>', '>') + + b_byte_chars = ' '.join(map(chr, range(0, 256))) + b_byte_encoded = 'ACABIAIgAyAEIAUgBiAHIAggCSAKIAsgDCANIA4gDyAQIBEgEiATIBQgFSAWIBcgGCAZIBogGyAcIB0gHiAfICAgISAiICMgJCAlICYgJyAoICkgKiArICwgLSAuIC8gMCAxIDIgMyA0IDUgNiA3IDggOSA6IDsgPCA9ID4gPyBAIEEgQiBDIEQgRSBGIEcgSCBJIEogSyBMIE0gTiBPIFAgUSBSIFMgVCBVIFYgVyBYIFkgWiBbIFwgXSBeIF8gYCBhIGIgYyBkIGUgZiBnIGggaSBqIGsgbCBtIG4gbyBwIHEgciBzIHQgdSB2IHcgeCB5IHogeyB8IH0gfiB/IIAggSCCIIMghCCFIIYghyCIIIkgiiCLIIwgjSCOII8gkCCRIJIgkyCUIJUgliCXIJggmSCaIJsgnCCdIJ4gnyCgIKEgoiCjIKQgpSCmIKcgqCCpIKogqyCsIK0griCvILAgsSCyILMgtCC1ILYgtyC4ILkguiC7ILwgvSC+IL8gwCDBIMIgwyDEIMUgxiDHIMggySDKIMsgzCDNIM4gzyDQINEg0iDTINQg1SDWINcg2CDZINog2yDcIN0g3iDfIOAg4SDiIOMg5CDlIOYg5yDoIOkg6iDrIOwg7SDuIO8g8CDxIPIg8yD0IPUg9iD3IPgg+SD6IPsg/CD9IP4g/w==' + + repr_re = re.compile('^<[^ ]*\.([^.]+) object at .*>$') + + u_paragraph = u'''ConfigObj is a simple but powerful config file reader and writer: an ini file +round tripper. Its main feature is that it is very easy to use, with a +straightforward programmer's interface and a simple syntax for config files. +It has lots of other features though: + + + + * Nested sections (subsections), to any level + * List values + * Multiple line values + * String interpolation (substitution) + * Integrated with a powerful validation system + o including automatic type checking/conversion + o repeated sections + o and allowing default values + * All comments in the file are preserved + * The order of keys/sections is preserved + * No external dependencies + * Full Unicode support + * A powerful unrepr mode for storing basic datatypes +''' + utf8_paragraph = u_paragraph.encode('utf-8', 'replace') + u_paragraph_out = [u'ConfigObj is a simple but powerful config file reader and writer: an', +u'ini file round tripper. Its main feature is that it is very easy to', +u"use, with a straightforward programmer's interface and a simple syntax", +u'for config files. 
It has lots of other features though:', +u'', +u'', +u'', +u' * Nested sections (subsections), to any level', +u' * List values', +u' * Multiple line values', +u' * String interpolation (substitution)', +u' * Integrated with a powerful validation system', +u' o including automatic type checking/conversion', +u' o repeated sections', +u' o and allowing default values', +u' * All comments in the file are preserved', +u' * The order of keys/sections is preserved', +u' * No external dependencies', +u' * Full Unicode support', +u' * A powerful unrepr mode for storing basic datatypes'] + + utf8_paragraph_out = [line.encode('utf-8', 'replace') for line in u_paragraph_out] + + u_mixed_para = u'くらとみ kuratomi ' * 5 + utf8_mixed_para = u_mixed_para.encode('utf8') + u_mixed_para_out = [u'くらとみ kuratomi くらとみ kuratomi くらとみ kuratomi くらとみ', + u'kuratomi くらとみ kuratomi'] + u_mixed_para_57_initial_subsequent_out = [u' くらとみ kuratomi くらとみ kuratomi くらとみ kuratomi', + u'----くらとみ kuratomi くらとみ kuratomi'] + utf8_mixed_para_out = map(to_bytes, u_mixed_para_out) + utf8_mixed_para_57_initial_subsequent_out = map(to_bytes, u_mixed_para_57_initial_subsequent_out) + + u_ascii_chars = u' '.join(map(unichr, range(0, 256))) + u_ascii_no_ctrl = u''.join([c for c in u_ascii_chars if ord(c) not in misc._CONTROL_CODES]) + u_ascii_ctrl_replace = u_ascii_chars.translate(dict([(c, u'?') for c in misc._CONTROL_CODES])) + utf8_ascii_chars = u_ascii_chars.encode('utf8') diff --git a/tests/data/locale-old/pt_BR.po b/tests/data/locale-old/pt_BR.po new file mode 100644 index 0000000..7b94955 --- /dev/null +++ b/tests/data/locale-old/pt_BR.po @@ -0,0 +1,46 @@ +# Portuguese (Brazil) translations for kitchen. +# Copyright (C) 2010 ORGANIZATION +# This file is distributed under the same license as the kitchen project. +# FIRST AUTHOR , 2010. +# +msgid "" +msgstr "" +"Project-Id-Version: kitchen 0.2.1a1\n" +"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n" +"POT-Creation-Date: 2010-09-03 00:49+0400\n" +"PO-Revision-Date: 2010-09-08 00:45-0400\n" +"Last-Translator: FULL NAME \n" +"Language-Team: pt_BR \n" +"Plural-Forms: nplurals=2; plural=(n > 1)\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 0.9.5\n" + +#: kitchen.py:1 +msgid "kitchen sink" +msgstr "placeholder" + +#: kitchen.py:2 +msgid "くらとみ" +msgstr "placeholder" + +#: kitchen.py:3 +msgid "Kuratomi" +msgstr "placeholder" + +#: kitchen.py:4 +msgid "1 lemon" +msgid_plural "4 lemons" +msgstr[0] "1 placeholder" +msgstr[1] "4 placeholders" + +#: kitchen.py:5 +msgid "一 limão" +msgid_plural "四 limões" +msgstr[0] "1 placeholder" +msgstr[1] "4 placeholders" + +#: kitchen.py:6 +msgid "Only café in fallback" +msgstr "Yes, only café in fallback" diff --git a/tests/data/locale-old/pt_BR/LC_MESSAGES/test.mo b/tests/data/locale-old/pt_BR/LC_MESSAGES/test.mo new file mode 100644 index 0000000..2f14103 Binary files /dev/null and b/tests/data/locale-old/pt_BR/LC_MESSAGES/test.mo differ diff --git a/tests/data/locale-old/test.pot b/tests/data/locale-old/test.pot new file mode 100644 index 0000000..ef73842 --- /dev/null +++ b/tests/data/locale-old/test.pot @@ -0,0 +1,46 @@ +# Translations template for kitchen. +# Copyright (C) 2010 ORGANIZATION +# This file is distributed under the same license as the kitchen project. +# FIRST AUTHOR , 2010. 
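The pt_BR fixture above, together with the prebuilt test.mo beside it, is what the i18n tests resolve messages against. A quick sketch of the lookup using the stdlib gettext module, assuming the shipped test.mo matches the .po shown (kitchen.i18n layers fallback handling on top of this, so treat it as an illustration only):

    import gettext

    t = gettext.translation('test', localedir='tests/data/locale-old',
                            languages=['pt_BR'])
    print t.ugettext(u'kitchen sink')            # -> u'placeholder'
    # Plural-Forms is "plural=(n > 1)", so pt_BR selects the singular
    # form for both n == 0 and n == 1
    print t.ungettext('1 lemon', '4 lemons', 1)  # -> u'1 placeholder'
    print t.ungettext('1 lemon', '4 lemons', 4)  # -> u'4 placeholders'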
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: kitchen 0.2.1a1\n" +"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n" +"POT-Creation-Date: 2010-09-03 00:49-0400\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 0.9.5\n" + +#: kitchen.py:1 +msgid "kitchen sink" +msgstr "" + +#: kitchen.py:2 +msgid "くらとみ" +msgstr "" + +#: kitchen.py:3 +msgid "Kuratomi" +msgstr "" + +#: kitchen.py:4 +msgid "1 lemon" +msgid_plural "4 lemons" +msgstr[0] "" +msgstr[1] "" + +#: kitchen.py:5 +msgid "一 limão" +msgid_plural "四 limões" +msgstr[0] "" +msgstr[1] "" + +#: kitchen.py:6 +msgid "Only café in fallback" +msgstr "" diff --git a/tests/data/locale/pt_BR.po b/tests/data/locale/pt_BR.po new file mode 100644 index 0000000..b4f46ab --- /dev/null +++ b/tests/data/locale/pt_BR.po @@ -0,0 +1,46 @@ +# Portuguese (Brazil) translations for kitchen. +# Copyright (C) 2010 ORGANIZATION +# This file is distributed under the same license as the kitchen project. +# FIRST AUTHOR , 2010. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: kitchen 0.2.1a1\n" +"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n" +"POT-Creation-Date: 2010-09-03 00:49+0400\n" +"PO-Revision-Date: 2010-09-08 00:45-0400\n" +"Last-Translator: FULL NAME \n" +"Language-Team: pt_BR \n" +"Plural-Forms: nplurals=2; plural=(n > 1)\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 0.9.5\n" + +#: kitchen.py:1 +msgid "kitchen sink" +msgstr "pia da cozinha" + +#: kitchen.py:2 +#, fuzzy +msgid "くらとみ" +msgstr "Kuratomi" + +#: kitchen.py:3 +#, fuzzy +msgid "Kuratomi" +msgstr "くらとみ" + +#: kitchen.py:4 +msgid "1 lemon" +msgid_plural "4 lemons" +msgstr[0] "一 limão" +msgstr[1] "四 limões" + +#: kitchen.py:5 +msgid "一 limão" +msgid_plural "四 limões" +msgstr[0] "1 lemon" +msgstr[1] "4 lemons" + diff --git a/tests/data/locale/pt_BR/LC_MESSAGES/test.mo b/tests/data/locale/pt_BR/LC_MESSAGES/test.mo new file mode 100644 index 0000000..36af5be Binary files /dev/null and b/tests/data/locale/pt_BR/LC_MESSAGES/test.mo differ diff --git a/tests/data/locale/test.pot b/tests/data/locale/test.pot new file mode 100644 index 0000000..137bcfd --- /dev/null +++ b/tests/data/locale/test.pot @@ -0,0 +1,42 @@ +# Translations template for kitchen. +# Copyright (C) 2010 ORGANIZATION +# This file is distributed under the same license as the kitchen project. +# FIRST AUTHOR , 2010. 
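Note the "#, fuzzy" flags on the entries in the pt_BR catalog just above: msgfmt drops fuzzy entries unless given -f/--use-fuzzy, and releaseutils.py compiles with plain "msgfmt -c", so a fuzzy message falls back to its msgid at runtime. A sketch of that behaviour, assuming the shipped test.mo was built without --use-fuzzy:

    # -*- coding: utf-8 -*-
    import gettext

    t = gettext.translation('test', localedir='tests/data/locale',
                            languages=['pt_BR'])
    print t.ugettext(u'kitchen sink')  # translated -> u'pia da cozinha'
    print t.ugettext(u'くらとみ')      # fuzzy, omitted from the .mo, so the
                                       # msgid itself comes back unchanged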
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: kitchen 0.2.1a1\n" +"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n" +"POT-Creation-Date: 2010-09-03 00:49-0400\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 0.9.5\n" + +#: kitchen.py:1 +msgid "kitchen sink" +msgstr "" + +#: kitchen.py:2 +msgid "くらとみ" +msgstr "" + +#: kitchen.py:3 +msgid "Kuratomi" +msgstr "" + +#: kitchen.py:4 +msgid "1 lemon" +msgid_plural "4 lemons" +msgstr[0] "" +msgstr[1] "" + +#: kitchen.py:5 +msgid "一 limão" +msgid_plural "四 limões" +msgstr[0] "" +msgstr[1] "" diff --git a/tests/subprocessdata/sigchild_ignore.py b/tests/subprocessdata/sigchild_ignore.py new file mode 100644 index 0000000..5b6dd08 --- /dev/null +++ b/tests/subprocessdata/sigchild_ignore.py @@ -0,0 +1,11 @@ +import os +import signal, sys +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) + +from kitchen.pycompat27.subprocess import _subprocess as subprocess + +# On Linux this causes os.waitpid to fail with OSError as the OS has already +# reaped our child process. The wait() passing the OSError on to the caller +# and causing us to exit with an error is what we are testing against. +signal.signal(signal.SIGCHLD, signal.SIG_IGN) +subprocess.Popen([sys.executable, '-c', 'print("albatross")']).wait() diff --git a/tests/test__all__.py b/tests/test__all__.py new file mode 100644 index 0000000..9f58f7a --- /dev/null +++ b/tests/test__all__.py @@ -0,0 +1,193 @@ +# -*- coding: utf-8 -*- +from nose import tools + +import os +import types +import warnings +from kitchen.pycompat24.sets import add_builtin_set +add_builtin_set() + +def logit(msg): + log = open('/var/tmp/test.log', 'a') + log.write('%s\n' % msg) + log.close() + +class NoAll(RuntimeError): + pass + +class FailedImport(RuntimeError): + pass + +class Test__all__(object): + '''Test that every function in __all__ exists and that no public methods + are missing from __all__ + ''' + known_private = set([('kitchen', 'collections', 'version_tuple_to_string'), + ('kitchen.collections', 'strictdict', 'defaultdict'), + ('kitchen', 'i18n', 'version_tuple_to_string'), + ('kitchen', 'i18n', 'to_bytes'), + ('kitchen', 'i18n', 'to_unicode'), + ('kitchen', 'i18n', 'ENOENT'), + ('kitchen', 'i18n', 'byte_string_valid_encoding'), + ('kitchen', 'iterutils', 'version_tuple_to_string'), + ('kitchen', 'pycompat24', 'version_tuple_to_string'), + ('kitchen', 'pycompat25', 'version_tuple_to_string'), + ('kitchen.pycompat25.collections', '_defaultdict', 'b_'), + ('kitchen', 'pycompat27', 'version_tuple_to_string'), + ('kitchen.pycompat27', 'subprocess', 'MAXFD'), + ('kitchen.pycompat27', 'subprocess', 'list2cmdline'), + ('kitchen.pycompat27', 'subprocess', 'mswindows'), + ('kitchen', 'text', 'version_tuple_to_string'), + ('kitchen.text', 'converters', 'b_'), + ('kitchen.text', 'converters', 'b64decode'), + ('kitchen.text', 'converters', 'b64encode'), + ('kitchen.text', 'converters', 'ControlCharError'), + ('kitchen.text', 'converters', 'guess_encoding'), + ('kitchen.text', 'converters', 'html_entities_unescape'), + ('kitchen.text', 'converters', 'process_control_chars'), + ('kitchen.text', 'converters', 'XmlEncodeError'), + ('kitchen.text', 'misc', 'b_'), + ('kitchen.text', 'misc', 'chardet'), + ('kitchen.text', 'misc', 'ControlCharError'), + ('kitchen.text', 'display', 'b_'), + ('kitchen.text', 'display', 
'ControlCharError'), + ('kitchen.text', 'display', 'to_bytes'), + ('kitchen.text', 'display', 'to_unicode'), + ('kitchen.text', 'utf8', 'b_'), + ('kitchen.text', 'utf8', 'byte_string_textual_width_fill'), + ('kitchen.text', 'utf8', 'byte_string_valid_encoding'), + ('kitchen.text', 'utf8', 'fill'), + ('kitchen.text', 'utf8', 'textual_width'), + ('kitchen.text', 'utf8', 'textual_width_chop'), + ('kitchen.text', 'utf8', 'to_bytes'), + ('kitchen.text', 'utf8', 'to_unicode'), + ('kitchen.text', 'utf8', 'wrap'), + ]) + lib_dir = os.path.join(os.path.dirname(__file__), '..', 'kitchen') + + def setUp(self): + # Silence deprecation warnings + warnings.simplefilter('ignore', DeprecationWarning) + def tearDown(self): + warnings.simplefilter('default', DeprecationWarning) + + def walk_modules(self, basedir, modpath): + files = os.listdir(basedir) + files.sort() + for fn in files: + path = os.path.join(basedir, fn) + if os.path.isdir(path): + pkg_init = os.path.join(path, '__init__.py') + if os.path.exists(pkg_init): + yield pkg_init, modpath + fn + for p, m in self.walk_modules(path, modpath + fn + '.'): + yield p, m + continue + if not fn.endswith('.py') or fn == '__init__.py': + continue + yield path, modpath + fn[:-3] + + def check_has__all__(self, modpath): + # This heuristic speeds up the process by removing, de facto, + # most test modules (and avoiding the auto-executing ones). + f = None + try: + try: + f = open(modpath, 'rb') + tools.ok_('__all__' in f.read(), '%s does not contain __all__' % modpath) + except IOError, e: + tools.ok_(False, '%s' % e) + finally: + if f: + f.close() + + def test_has__all__(self): + ''' + For each module, check that it has an __all__ + ''' + # Blacklisted modules and packages + blacklist = set([ ]) + + for path, modname in [m for m in self.walk_modules(self.lib_dir, '') + if m[1] not in blacklist]: + # Check that it has an __all__ + yield self.check_has__all__, path + + def check_everything_in__all__exists(self, modname, modpath): + names = {} + exec 'from %s import %s' % (modpath, modname) in names + if not hasattr(names[modname], '__all__'): + # This should have been reported by test_has__all__ + return + + interior_names = {} + try: + exec 'from %s.%s import *' % (modpath, modname) in interior_names + except Exception, e: + # Include the module name in the exception string + tools.ok_(False, '__all__ failure in %s: %s: %s' % ( + modname, e.__class__.__name__, e)) + if '__builtins__' in interior_names: + del interior_names['__builtins__'] + keys = set(interior_names) + all = set(names[modname].__all__) + tools.ok_(keys == all) + + def test_everything_in__all__exists(self): + ''' + For each name in module's __all__, check that it exists + ''' + # Blacklisted modules and packages + blacklist = set([ ]) + + for path, modname in [m for m in self.walk_modules(self.lib_dir, '') + if m[1] not in blacklist]: + # From path, deduce the module name + from_name = path[path.find('../kitchen') + 3:] + if from_name.endswith('__init__.py'): + # Remove __init__.py as well as the filename + from_name = os.path.dirname(from_name) + from_name = os.path.dirname(from_name) + from_name = unicode(from_name, 'utf-8') + from_name = from_name.translate({ord(u'/'): u'.'}) + from_name = from_name.encode('utf-8') + yield self.check_everything_in__all__exists, modname.split('.')[-1], from_name + + + def check__all__is_complete(self, modname, modpath): + names = {} + exec 'from %s import %s' % (modpath, modname) in names + if not hasattr(names[modname], '__all__'): + # This should have been 
reported by test_has__all__ + return + + mod = names[modname] + expected_public = [k for k in mod.__dict__ if (modpath, modname, k) + not in self.known_private and not k.startswith("_") and not + isinstance(mod.__dict__[k], types.ModuleType)] + + all = set(mod.__all__) + public = set(expected_public) + tools.ok_(all.issuperset(public), 'These public names are not in %s.__all__: %s' + % (modname, ', '.join(public.difference(all)))) + + def test__all__is_complete(self): + ''' + For each module, check that every public name is in __all__ + ''' + # Blacklisted modules and packages + blacklist = set(['pycompat27.subprocess._subprocess', + 'pycompat24.base64._base64']) + + for path, modname in [m for m in self.walk_modules(self.lib_dir, '') + if m[1] not in blacklist]: + # From path, deduce the module name + from_name = path[path.find('../kitchen') + 3:] + if from_name.endswith('__init__.py'): + # Remove __init__.py as well as the filename + from_name = os.path.dirname(from_name) + from_name = os.path.dirname(from_name) + from_name = unicode(from_name, 'utf-8') + from_name = from_name.translate({ord(u'/'): u'.'}) + from_name = from_name.encode('utf-8') + yield self.check__all__is_complete, modname.split('.')[-1], from_name diff --git a/tests/test_base64.py b/tests/test_base64.py new file mode 100644 index 0000000..bdb388d --- /dev/null +++ b/tests/test_base64.py @@ -0,0 +1,190 @@ +import unittest +from test import test_support +from kitchen.pycompat24.base64 import _base64 as base64 + + + +class LegacyBase64TestCase(unittest.TestCase): + def test_encodestring(self): + eq = self.assertEqual + eq(base64.encodestring("www.python.org"), "d3d3LnB5dGhvbi5vcmc=\n") + eq(base64.encodestring("a"), "YQ==\n") + eq(base64.encodestring("ab"), "YWI=\n") + eq(base64.encodestring("abc"), "YWJj\n") + eq(base64.encodestring(""), "") + eq(base64.encodestring("abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "0123456789!@#0^&*();:<>,. []{}"), + "YWJjZGVmZ2hpamtsbW5vcHFyc3R1dnd4eXpBQkNE" + "RUZHSElKS0xNTk9QUVJTVFVWV1hZWjAxMjM0\nNT" + "Y3ODkhQCMwXiYqKCk7Ojw+LC4gW117fQ==\n") + + def test_decodestring(self): + eq = self.assertEqual + eq(base64.decodestring("d3d3LnB5dGhvbi5vcmc=\n"), "www.python.org") + eq(base64.decodestring("YQ==\n"), "a") + eq(base64.decodestring("YWI=\n"), "ab") + eq(base64.decodestring("YWJj\n"), "abc") + eq(base64.decodestring("YWJjZGVmZ2hpamtsbW5vcHFyc3R1dnd4eXpBQkNE" + "RUZHSElKS0xNTk9QUVJTVFVWV1hZWjAxMjM0\nNT" + "Y3ODkhQCMwXiYqKCk7Ojw+LC4gW117fQ==\n"), + "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "0123456789!@#0^&*();:<>,. []{}") + eq(base64.decodestring(''), '') + + def test_encode(self): + eq = self.assertEqual + from cStringIO import StringIO + infp = StringIO('abcdefghijklmnopqrstuvwxyz' + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + '0123456789!@#0^&*();:<>,. 
[]{}') + outfp = StringIO() + base64.encode(infp, outfp) + eq(outfp.getvalue(), + 'YWJjZGVmZ2hpamtsbW5vcHFyc3R1dnd4eXpBQkNE' + 'RUZHSElKS0xNTk9QUVJTVFVWV1hZWjAxMjM0\nNT' + 'Y3ODkhQCMwXiYqKCk7Ojw+LC4gW117fQ==\n') + + def test_decode(self): + from cStringIO import StringIO + infp = StringIO('d3d3LnB5dGhvbi5vcmc=') + outfp = StringIO() + base64.decode(infp, outfp) + self.assertEqual(outfp.getvalue(), 'www.python.org') + + + +class BaseXYTestCase(unittest.TestCase): + def test_b64encode(self): + eq = self.assertEqual + # Test default alphabet + eq(base64.b64encode("www.python.org"), "d3d3LnB5dGhvbi5vcmc=") + eq(base64.b64encode('\x00'), 'AA==') + eq(base64.b64encode("a"), "YQ==") + eq(base64.b64encode("ab"), "YWI=") + eq(base64.b64encode("abc"), "YWJj") + eq(base64.b64encode(""), "") + eq(base64.b64encode("abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "0123456789!@#0^&*();:<>,. []{}"), + "YWJjZGVmZ2hpamtsbW5vcHFyc3R1dnd4eXpBQkNE" + "RUZHSElKS0xNTk9QUVJTVFVWV1hZWjAxMjM0NT" + "Y3ODkhQCMwXiYqKCk7Ojw+LC4gW117fQ==") + # Test with arbitrary alternative characters + eq(base64.b64encode('\xd3V\xbeo\xf7\x1d', altchars='*$'), '01a*b$cd') + # Test standard alphabet + eq(base64.standard_b64encode("www.python.org"), "d3d3LnB5dGhvbi5vcmc=") + eq(base64.standard_b64encode("a"), "YQ==") + eq(base64.standard_b64encode("ab"), "YWI=") + eq(base64.standard_b64encode("abc"), "YWJj") + eq(base64.standard_b64encode(""), "") + eq(base64.standard_b64encode("abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "0123456789!@#0^&*();:<>,. []{}"), + "YWJjZGVmZ2hpamtsbW5vcHFyc3R1dnd4eXpBQkNE" + "RUZHSElKS0xNTk9QUVJTVFVWV1hZWjAxMjM0NT" + "Y3ODkhQCMwXiYqKCk7Ojw+LC4gW117fQ==") + # Test with 'URL safe' alternative characters + eq(base64.urlsafe_b64encode('\xd3V\xbeo\xf7\x1d'), '01a-b_cd') + + def test_b64decode(self): + eq = self.assertEqual + eq(base64.b64decode("d3d3LnB5dGhvbi5vcmc="), "www.python.org") + eq(base64.b64decode('AA=='), '\x00') + eq(base64.b64decode("YQ=="), "a") + eq(base64.b64decode("YWI="), "ab") + eq(base64.b64decode("YWJj"), "abc") + eq(base64.b64decode("YWJjZGVmZ2hpamtsbW5vcHFyc3R1dnd4eXpBQkNE" + "RUZHSElKS0xNTk9QUVJTVFVWV1hZWjAxMjM0\nNT" + "Y3ODkhQCMwXiYqKCk7Ojw+LC4gW117fQ=="), + "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "0123456789!@#0^&*();:<>,. []{}") + eq(base64.b64decode(''), '') + # Test with arbitrary alternative characters + eq(base64.b64decode('01a*b$cd', altchars='*$'), '\xd3V\xbeo\xf7\x1d') + # Test standard alphabet + eq(base64.standard_b64decode("d3d3LnB5dGhvbi5vcmc="), "www.python.org") + eq(base64.standard_b64decode("YQ=="), "a") + eq(base64.standard_b64decode("YWI="), "ab") + eq(base64.standard_b64decode("YWJj"), "abc") + eq(base64.standard_b64decode(""), "") + eq(base64.standard_b64decode("YWJjZGVmZ2hpamtsbW5vcHFyc3R1dnd4eXpBQkNE" + "RUZHSElKS0xNTk9QUVJTVFVWV1hZWjAxMjM0NT" + "Y3ODkhQCMwXiYqKCk7Ojw+LC4gW117fQ=="), + "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "0123456789!@#0^&*();:<>,. 
[]{}") + # Test with 'URL safe' alternative characters + eq(base64.urlsafe_b64decode('01a-b_cd'), '\xd3V\xbeo\xf7\x1d') + + def test_b64decode_error(self): + self.assertRaises(TypeError, base64.b64decode, 'abc') + + def test_b32encode(self): + eq = self.assertEqual + eq(base64.b32encode(''), '') + eq(base64.b32encode('\x00'), 'AA======') + eq(base64.b32encode('a'), 'ME======') + eq(base64.b32encode('ab'), 'MFRA====') + eq(base64.b32encode('abc'), 'MFRGG===') + eq(base64.b32encode('abcd'), 'MFRGGZA=') + eq(base64.b32encode('abcde'), 'MFRGGZDF') + + def test_b32decode(self): + eq = self.assertEqual + eq(base64.b32decode(''), '') + eq(base64.b32decode('AA======'), '\x00') + eq(base64.b32decode('ME======'), 'a') + eq(base64.b32decode('MFRA===='), 'ab') + eq(base64.b32decode('MFRGG==='), 'abc') + eq(base64.b32decode('MFRGGZA='), 'abcd') + eq(base64.b32decode('MFRGGZDF'), 'abcde') + + def test_b32decode_casefold(self): + eq = self.assertEqual + eq(base64.b32decode('', True), '') + eq(base64.b32decode('ME======', True), 'a') + eq(base64.b32decode('MFRA====', True), 'ab') + eq(base64.b32decode('MFRGG===', True), 'abc') + eq(base64.b32decode('MFRGGZA=', True), 'abcd') + eq(base64.b32decode('MFRGGZDF', True), 'abcde') + # Lower cases + eq(base64.b32decode('me======', True), 'a') + eq(base64.b32decode('mfra====', True), 'ab') + eq(base64.b32decode('mfrgg===', True), 'abc') + eq(base64.b32decode('mfrggza=', True), 'abcd') + eq(base64.b32decode('mfrggzdf', True), 'abcde') + # Expected exceptions + self.assertRaises(TypeError, base64.b32decode, 'me======') + # Mapping zero and one + eq(base64.b32decode('MLO23456'), 'b\xdd\xad\xf3\xbe') + eq(base64.b32decode('M1023456', map01='L'), 'b\xdd\xad\xf3\xbe') + eq(base64.b32decode('M1023456', map01='I'), 'b\x1d\xad\xf3\xbe') + + def test_b32decode_error(self): + self.assertRaises(TypeError, base64.b32decode, 'abc') + self.assertRaises(TypeError, base64.b32decode, 'ABCDEF==') + + def test_b16encode(self): + eq = self.assertEqual + eq(base64.b16encode('\x01\x02\xab\xcd\xef'), '0102ABCDEF') + eq(base64.b16encode('\x00'), '00') + + def test_b16decode(self): + eq = self.assertEqual + eq(base64.b16decode('0102ABCDEF'), '\x01\x02\xab\xcd\xef') + eq(base64.b16decode('00'), '\x00') + # Lower case is not allowed without a flag + self.assertRaises(TypeError, base64.b16decode, '0102abcdef') + # Case fold + eq(base64.b16decode('0102abcdef', True), '\x01\x02\xab\xcd\xef') + + + +#def test_main(): +# test_support.run_unittest(__name__) +# +#if __name__ == '__main__': +# test_main() diff --git a/tests/test_collections.py b/tests/test_collections.py new file mode 100644 index 0000000..ec6206e --- /dev/null +++ b/tests/test_collections.py @@ -0,0 +1,156 @@ +# -*- coding: utf-8 -*- +# +import unittest +from nose import tools + +from kitchen.pycompat24.sets import add_builtin_set +add_builtin_set() + +from kitchen import collections + +def test_strict_dict_get_set(): + '''Test getting and setting items in StrictDict''' + d = collections.StrictDict() + d[u'a'] = 1 + d['a'] = 2 + tools.ok_(d[u'a'] != d['a']) + tools.ok_(len(d) == 2) + + d[u'\xf1'] = 1 + d['\xf1'] = 2 + d[u'\xf1'.encode('utf8')] = 3 + tools.ok_(d[u'\xf1'] == 1) + tools.ok_(d['\xf1'] == 2) + tools.ok_(d[u'\xf1'.encode('utf8')] == 3) + tools.ok_(len(d) == 5) + +class TestStrictDict(unittest.TestCase): + def setUp(self): + self.d = collections.StrictDict() + self.d[u'a'] = 1 + self.d['a'] = 2 + self.d[u'\xf1'] = 1 + self.d['\xf1'] = 2 + self.d[u'\xf1'.encode('utf8')] = 3 + self.keys = [u'a', 'a', u'\xf1', '\xf1', 
u'\xf1'.encode('utf8')] + + def tearDown(self): + del(self.d) + + def _compare_lists(self, list1, list2, debug=False): + '''We have a mixture of bytes and unicode and need python2.3 compat + + So we have to compare these lists manually and inefficiently + ''' + def _compare_lists_helper(compare_to, dupes, idx, length): + if i not in compare_to: + return False + for n in range(1, length + 1): + if i not in dupes[n][idx]: + dupes[n][idx].add(i) + return True + if len(list1) != len(list2): + return False + + list1_dupes = dict([(i, (set(), set(), set())) for i in range(1, len(list1)+1)]) + list2_dupes = dict([(i, (set(), set(), set())) for i in range(1, len(list1)+1)]) + + list1_u = [l for l in list1 if isinstance(l, unicode)] + list1_b = [l for l in list1 if isinstance(l, str)] + list1_o = [l for l in list1 if not (isinstance(l, unicode) or isinstance(l, str))] + + list2_u = [l for l in list2 if isinstance(l, unicode)] + list2_b = [l for l in list2 if isinstance(l, str)] + list2_o = [l for l in list2 if not (isinstance(l, unicode) or isinstance(l, str))] + + for i in list1: + if isinstance(i, unicode): + if not _compare_lists_helper(list2_u, list1_dupes, 0, len(list1)): + return False + elif isinstance(i, str): + if not _compare_lists_helper(list2_b, list1_dupes, 1, len(list1)): + return False + else: + if not _compare_lists_helper(list2_o, list1_dupes, 2, len(list1)): + return False + + if list1_dupes[2][0] or list1_dupes[2][1] or list1_dupes[2][2]: + for i in list2: + if isinstance(i, unicode): + if not _compare_lists_helper(list1_u, list2_dupes, 0, len(list1)): + return False + elif isinstance(i, str): + if not _compare_lists_helper(list1_b, list2_dupes, 1, len(list1)): + return False + else: + if not _compare_lists_helper(list1_o, list2_dupes, 2, len(list1)): + return False + + for i in range(2, len(list1)+1): + for n in list1_dupes[i]: + if n not in list2_dupes[i]: + return False + + return True + + def test__compare_list(self): + '''*sigh* this test support function is so complex we need to test it''' + tools.ok_(self._compare_lists(['a', 'b', 'c'], ['c', 'a', 'b'])) + tools.ok_(not self._compare_lists(['b', 'c'], ['c', 'a', 'b'])) + tools.ok_(not self._compare_lists([u'a', 'b'], ['a', 'b'])) + tools.ok_(not self._compare_lists(['a', u'b'], [u'a', 'b'])) + tools.ok_(self._compare_lists(['a', 'b', 1], ['a', 1, 'b'])) + tools.ok_(self._compare_lists([u'a', u'b'], [u'a', u'b'])) + tools.ok_(self._compare_lists([u'a', 'b'], [u'a', 'b'])) + tools.ok_(not self._compare_lists([u'a', 'b'], [u'a', u'b'])) + tools.ok_(self._compare_lists([u'a', 'b', 'b', 'c', u'a'], [u'a', u'a', 'b', 'c', 'b'])) + tools.ok_(not self._compare_lists([u'a', 'b', 'b', 'c', 'a'], [u'a', u'a', 'b', 'c', 'b'])) + tools.ok_(not self._compare_lists([u'a', 'b', 'b', 'c', u'a'], [u'a', 'b', 'b', 'c', 'b'])) + + def test_strict_dict_len(self): + '''StrictDict len''' + tools.ok_(len(self.d) == 5) + + def test_strict_dict_del(self): + '''StrictDict del''' + tools.ok_(len(self.d) == 5) + del(self.d[u'\xf1']) + tools.assert_raises(KeyError, self.d.__getitem__, u'\xf1') + tools.ok_(len(self.d) == 4) + + def test_strict_dict_iter(self): + '''StrictDict iteration''' + keys = [] + for k in self.d: + keys.append(k) + tools.ok_(self._compare_lists(keys, self.keys)) + + keys = [] + for k in self.d.iterkeys(): + keys.append(k) + tools.ok_(self._compare_lists(keys, self.keys)) + + keys = [k for k in self.d] + tools.ok_(self._compare_lists(keys, self.keys)) + + keys = [] + for k in self.d.keys(): + keys.append(k) + 
tools.ok_(self._compare_lists(keys, self.keys)) + + def test_strict_dict_contains(self): + '''StrictDict contains function''' + tools.ok_('b' not in self.d) + tools.ok_(u'b' not in self.d) + tools.ok_('\xf1' in self.d) + tools.ok_(u'\xf1' in self.d) + tools.ok_('a' in self.d) + tools.ok_(u'a' in self.d) + + del(self.d[u'\xf1']) + tools.ok_(u'\xf1' not in self.d) + tools.ok_('\xf1' in self.d) + + del(self.d['a']) + tools.ok_(u'a' in self.d) + tools.ok_('a' not in self.d) diff --git a/tests/test_converters.py b/tests/test_converters.py new file mode 100644 index 0000000..9f880d4 --- /dev/null +++ b/tests/test_converters.py @@ -0,0 +1,387 @@ +# -*- coding: utf-8 -*- +# + +import unittest +from nose import tools +from nose.plugins.skip import SkipTest + +import StringIO +import warnings + +try: + import chardet +except: + chardet = None + +from kitchen.text import converters +from kitchen.text.exceptions import XmlEncodeError + +import base_classes + +class UnicodeNoStr(object): + def __unicode__(self): + return u'El veloz murciélago saltó sobre el perro perezoso.' + +class StrNoUnicode(object): + def __str__(self): + return u'El veloz murciélago saltó sobre el perro perezoso.'.encode('utf8') + +class StrReturnsUnicode(object): + def __str__(self): + return u'El veloz murciélago saltó sobre el perro perezoso.' + +class UnicodeReturnsStr(object): + def __unicode__(self): + return u'El veloz murciélago saltó sobre el perro perezoso.'.encode('utf8') + +class UnicodeStrCrossed(object): + def __unicode__(self): + return u'El veloz murciélago saltó sobre el perro perezoso.'.encode('utf8') + + def __str__(self): + return u'El veloz murciélago saltó sobre el perro perezoso.' + +class ReprUnicode(object): + def __repr__(self): + return u'ReprUnicode(El veloz murciélago saltó sobre el perro perezoso.)' + +class TestConverters(unittest.TestCase, base_classes.UnicodeTestData): + def test_to_unicode(self): + '''Test to_unicode when the user gives good values''' + tools.ok_(converters.to_unicode(self.u_japanese, encoding='latin1') == self.u_japanese) + + tools.ok_(converters.to_unicode(self.utf8_spanish) == self.u_spanish) + tools.ok_(converters.to_unicode(self.utf8_japanese) == self.u_japanese) + + tools.ok_(converters.to_unicode(self.latin1_spanish, encoding='latin1') == self.u_spanish) + tools.ok_(converters.to_unicode(self.euc_jp_japanese, encoding='euc_jp') == self.u_japanese) + + tools.assert_raises(TypeError, converters.to_unicode, *[5], **{'nonstring': 'foo'}) + + def test_to_unicode_errors(self): + tools.ok_(converters.to_unicode(self.latin1_spanish) == self.u_mangled_spanish_latin1_as_utf8) + tools.ok_(converters.to_unicode(self.latin1_spanish, errors='ignore') == self.u_spanish_ignore) + tools.assert_raises(UnicodeDecodeError, converters.to_unicode, + *[self.latin1_spanish], **{'errors': 'strict'}) + + def test_to_unicode_nonstring(self): + tools.ok_(converters.to_unicode(5) == u'5') + tools.ok_(converters.to_unicode(5, nonstring='empty') == u'') + tools.ok_(converters.to_unicode(5, nonstring='passthru') == 5) + tools.ok_(converters.to_unicode(5, nonstring='simplerepr') == u'5') + tools.ok_(converters.to_unicode(5, nonstring='repr') == u'5') + tools.assert_raises(TypeError, converters.to_unicode, *[5], **{'nonstring': 'strict'}) + + tools.ok_(converters.to_unicode(UnicodeNoStr(), nonstring='simplerepr') == self.u_spanish) + tools.ok_(converters.to_unicode(StrNoUnicode(), nonstring='simplerepr') == self.u_spanish) + tools.ok_(converters.to_unicode(StrReturnsUnicode(), nonstring='simplerepr') == 
self.u_spanish) + tools.ok_(converters.to_unicode(UnicodeReturnsStr(), nonstring='simplerepr') == self.u_spanish) + tools.ok_(converters.to_unicode(UnicodeStrCrossed(), nonstring='simplerepr') == self.u_spanish) + + obj_repr = converters.to_unicode(object, nonstring='simplerepr') + tools.ok_(obj_repr == u"" and isinstance(obj_repr, unicode)) + + def test_to_bytes(self): + '''Test to_bytes when the user gives good values''' + tools.ok_(converters.to_bytes(self.utf8_japanese, encoding='latin1') == self.utf8_japanese) + + tools.ok_(converters.to_bytes(self.u_spanish) == self.utf8_spanish) + tools.ok_(converters.to_bytes(self.u_japanese) == self.utf8_japanese) + + tools.ok_(converters.to_bytes(self.u_spanish, encoding='latin1') == self.latin1_spanish) + tools.ok_(converters.to_bytes(self.u_japanese, encoding='euc_jp') == self.euc_jp_japanese) + + def test_to_bytes_errors(self): + tools.ok_(converters.to_bytes(self.u_mixed, encoding='latin1') == + self.latin1_mixed_replace) + tools.ok_(converters.to_bytes(self.u_mixed, encoding='latin', + errors='ignore') == self.latin1_mixed_ignore) + tools.assert_raises(UnicodeEncodeError, converters.to_bytes, + *[self.u_mixed], **{'errors': 'strict', 'encoding': 'latin1'}) + + def _check_repr_bytes(self, repr_string, obj_name): + tools.ok_(isinstance(repr_string, str)) + match = self.repr_re.match(repr_string) + tools.ok_(match != None) + tools.ok_(match.groups()[0] == obj_name) + + def test_to_bytes_nonstring(self): + tools.ok_(converters.to_bytes(5) == '5') + tools.ok_(converters.to_bytes(5, nonstring='empty') == '') + tools.ok_(converters.to_bytes(5, nonstring='passthru') == 5) + tools.ok_(converters.to_bytes(5, nonstring='simplerepr') == '5') + tools.ok_(converters.to_bytes(5, nonstring='repr') == '5') + + # Raise a TypeError if the msg is nonstring and we're set to strict + tools.assert_raises(TypeError, converters.to_bytes, *[5], **{'nonstring': 'strict'}) + # Raise a TypeError if given an invalid nonstring arg + tools.assert_raises(TypeError, converters.to_bytes, *[5], **{'nonstring': 'INVALID'}) + + # No __str__ method so this returns repr + string = converters.to_bytes(UnicodeNoStr(), nonstring='simplerepr') + self._check_repr_bytes(string, 'UnicodeNoStr') + + # This object's _str__ returns a utf8 encoded object + tools.ok_(converters.to_bytes(StrNoUnicode(), nonstring='simplerepr') == self.utf8_spanish) + + # This object's __str__ returns unicode which to_bytes converts to utf8 + tools.ok_(converters.to_bytes(StrReturnsUnicode(), nonstring='simplerepr') == self.utf8_spanish) + # Unless we explicitly ask for something different + tools.ok_(converters.to_bytes(StrReturnsUnicode(), + nonstring='simplerepr', encoding='latin1') == self.latin1_spanish) + + # This object has no __str__ so it returns repr + string = converters.to_bytes(UnicodeReturnsStr(), nonstring='simplerepr') + self._check_repr_bytes(string, 'UnicodeReturnsStr') + + # This object's __str__ returns unicode which to_bytes converts to utf8 + tools.ok_(converters.to_bytes(UnicodeStrCrossed(), nonstring='simplerepr') == self.utf8_spanish) + + # This object's __repr__ returns unicode which to_bytes converts to utf8 + tools.ok_(converters.to_bytes(ReprUnicode(), nonstring='simplerepr') + == u'ReprUnicode(El veloz murciélago saltó sobre el perro perezoso.)'.encode('utf8')) + tools.ok_(converters.to_bytes(ReprUnicode(), nonstring='repr') == + u'ReprUnicode(El veloz murciélago saltó sobre el perro perezoso.)'.encode('utf8')) + + obj_repr = converters.to_bytes(object, nonstring='simplerepr') + 
tools.ok_(obj_repr == "" and isinstance(obj_repr, str)) + + def test_unicode_to_xml(self): + tools.ok_(converters.unicode_to_xml(None) == '') + tools.assert_raises(XmlEncodeError, converters.unicode_to_xml, *['byte string']) + tools.assert_raises(ValueError, converters.unicode_to_xml, *[u'string'], **{'control_chars': 'foo'}) + tools.assert_raises(XmlEncodeError, converters.unicode_to_xml, + *[u'string\u0002'], **{'control_chars': 'strict'}) + tools.ok_(converters.unicode_to_xml(self.u_entity) == self.utf8_entity_escape) + tools.ok_(converters.unicode_to_xml(self.u_entity, attrib=True) == self.utf8_attrib_escape) + + def test_xml_to_unicode(self): + tools.ok_(converters.xml_to_unicode(self.utf8_entity_escape, 'utf8', 'replace') == self.u_entity) + tools.ok_(converters.xml_to_unicode(self.utf8_attrib_escape, 'utf8', 'replace') == self.u_entity) + + def test_xml_to_byte_string(self): + tools.ok_(converters.xml_to_byte_string(self.utf8_entity_escape, 'utf8', 'replace') == self.u_entity.encode('utf8')) + tools.ok_(converters.xml_to_byte_string(self.utf8_attrib_escape, 'utf8', 'replace') == self.u_entity.encode('utf8')) + + tools.ok_(converters.xml_to_byte_string(self.utf8_attrib_escape, + output_encoding='euc_jp', errors='replace') == + self.u_entity.encode('euc_jp', 'replace')) + tools.ok_(converters.xml_to_byte_string(self.utf8_attrib_escape, + output_encoding='latin1', errors='replace') == + self.u_entity.encode('latin1', 'replace')) + + def test_byte_string_to_xml(self): + tools.assert_raises(XmlEncodeError, converters.byte_string_to_xml, *[u'test']) + tools.ok_(converters.byte_string_to_xml(self.utf8_entity) == self.utf8_entity_escape) + tools.ok_(converters.byte_string_to_xml(self.utf8_entity, attrib=True) == self.utf8_attrib_escape) + + def test_bytes_to_xml(self): + tools.ok_(converters.bytes_to_xml(self.b_byte_chars) == self.b_byte_encoded) + + def test_xml_to_bytes(self): + tools.ok_(converters.xml_to_bytes(self.b_byte_encoded) == self.b_byte_chars) + + def test_guess_encoding_to_xml(self): + tools.ok_(converters.guess_encoding_to_xml(self.u_entity) == self.utf8_entity_escape) + tools.ok_(converters.guess_encoding_to_xml(self.utf8_spanish) == self.utf8_spanish) + tools.ok_(converters.guess_encoding_to_xml(self.latin1_spanish) == self.utf8_spanish) + tools.ok_(converters.guess_encoding_to_xml(self.utf8_japanese) == self.utf8_japanese) + + def test_guess_encoding_to_xml_euc_japanese(self): + if chardet: + tools.ok_(converters.guess_encoding_to_xml(self.euc_jp_japanese) + == self.utf8_japanese) + else: + raise SkipTest('chardet not installed, euc_japanese won\'t be detected') + + def test_guess_encoding_to_xml_euc_japanese_mangled(self): + if chardet: + raise SkipTest('chardet installed, euc_japanese won\'t be mangled') + else: + tools.ok_(converters.guess_encoding_to_xml(self.euc_jp_japanese) + == self.utf8_mangled_euc_jp_as_latin1) + +class TestGetWriter(unittest.TestCase, base_classes.UnicodeTestData): + def setUp(self): + self.io = StringIO.StringIO() + + def test_utf8_writer(self): + writer = converters.getwriter('utf-8') + io = writer(self.io) + io.write(u'%s\n' % self.u_japanese) + io.seek(0) + result = io.read().strip() + tools.ok_(result == self.utf8_japanese) + + io.seek(0) + io.truncate(0) + io.write('%s\n' % self.euc_jp_japanese) + io.seek(0) + result = io.read().strip() + tools.ok_(result == self.euc_jp_japanese) + + io.seek(0) + io.truncate(0) + io.write('%s\n' % self.utf8_japanese) + io.seek(0) + result = io.read().strip() + tools.ok_(result == self.utf8_japanese) + + def 
test_error_handlers(self): + '''Test setting alternate error handlers''' + writer = converters.getwriter('latin1') + io = writer(self.io, errors='strict') + tools.assert_raises(UnicodeEncodeError, io.write, self.u_japanese) + + +class TestExceptionConverters(unittest.TestCase, base_classes.UnicodeTestData): + def setUp(self): + self.exceptions = {} + tests = {'u_jpn': self.u_japanese, + 'u_spanish': self.u_spanish, + 'utf8_jpn': self.utf8_japanese, + 'utf8_spanish': self.utf8_spanish, + 'euc_jpn': self.euc_jp_japanese, + 'latin1_spanish': self.latin1_spanish} + for test in tests.iteritems(): + try: + raise Exception(test[1]) + except Exception, self.exceptions[test[0]]: + pass + + def test_exception_to_unicode_with_unicode(self): + tools.ok_(converters.exception_to_unicode(self.exceptions['u_jpn']) == self.u_japanese) + tools.ok_(converters.exception_to_unicode(self.exceptions['u_spanish']) == self.u_spanish) + + def test_exception_to_unicode_with_bytes(self): + tools.ok_(converters.exception_to_unicode(self.exceptions['utf8_jpn']) == self.u_japanese) + tools.ok_(converters.exception_to_unicode(self.exceptions['utf8_spanish']) == self.u_spanish) + # Mangled latin1/utf8 conversion but no tracebacks + tools.ok_(converters.exception_to_unicode(self.exceptions['latin1_spanish']) == self.u_mangled_spanish_latin1_as_utf8) + # Mangled euc_jp/utf8 conversion but no tracebacks + tools.ok_(converters.exception_to_unicode(self.exceptions['euc_jpn']) == self.u_mangled_euc_jp_as_utf8) + + def test_exception_to_unicode_custom(self): + # If given custom functions, then we should not mangle + c = [lambda e: converters.to_unicode(e, encoding='euc_jp')] + tools.ok_(converters.exception_to_unicode(self.exceptions['euc_jpn'], + converters=c) == self.u_japanese) + c.extend(converters.EXCEPTION_CONVERTERS) + tools.ok_(converters.exception_to_unicode(self.exceptions['euc_jpn'], + converters=c) == self.u_japanese) + + c = [lambda e: converters.to_unicode(e, encoding='latin1')] + tools.ok_(converters.exception_to_unicode(self.exceptions['latin1_spanish'], + converters=c) == self.u_spanish) + c.extend(converters.EXCEPTION_CONVERTERS) + tools.ok_(converters.exception_to_unicode(self.exceptions['latin1_spanish'], + converters=c) == self.u_spanish) + + def test_exception_to_bytes_with_unicode(self): + tools.ok_(converters.exception_to_bytes(self.exceptions['u_jpn']) == self.utf8_japanese) + tools.ok_(converters.exception_to_bytes(self.exceptions['u_spanish']) == self.utf8_spanish) + + def test_exception_to_bytes_with_bytes(self): + tools.ok_(converters.exception_to_bytes(self.exceptions['utf8_jpn']) == self.utf8_japanese) + tools.ok_(converters.exception_to_bytes(self.exceptions['utf8_spanish']) == self.utf8_spanish) + tools.ok_(converters.exception_to_bytes(self.exceptions['latin1_spanish']) == self.latin1_spanish) + tools.ok_(converters.exception_to_bytes(self.exceptions['euc_jpn']) == self.euc_jp_japanese) + + def test_exception_to_bytes_custom(self): + # If given custom functions, then we should not mangle + c = [lambda e: converters.to_bytes(e, encoding='euc_jp')] + tools.ok_(converters.exception_to_bytes(self.exceptions['euc_jpn'], + converters=c) == self.euc_jp_japanese) + c.extend(converters.EXCEPTION_CONVERTERS) + tools.ok_(converters.exception_to_bytes(self.exceptions['euc_jpn'], + converters=c) == self.euc_jp_japanese) + + c = [lambda e: converters.to_bytes(e, encoding='latin1')] + tools.ok_(converters.exception_to_bytes(self.exceptions['latin1_spanish'], + converters=c) == self.latin1_spanish) + 
+        c.extend(converters.EXCEPTION_CONVERTERS)
+        tools.ok_(converters.exception_to_bytes(self.exceptions['latin1_spanish'],
+            converters=c) == self.latin1_spanish)
+
+
+class TestDeprecatedConverters(TestConverters):
+    def setUp(self):
+        warnings.simplefilter('ignore', DeprecationWarning)
+
+    def tearDown(self):
+        warnings.simplefilter('default', DeprecationWarning)
+
+    def test_to_xml(self):
+        tools.ok_(converters.to_xml(self.u_entity) == self.utf8_entity_escape)
+        tools.ok_(converters.to_xml(self.utf8_spanish) == self.utf8_spanish)
+        tools.ok_(converters.to_xml(self.latin1_spanish) == self.utf8_spanish)
+        tools.ok_(converters.to_xml(self.utf8_japanese) == self.utf8_japanese)
+
+    def test_to_utf8(self):
+        tools.ok_(converters.to_utf8(self.u_japanese) == self.utf8_japanese)
+        tools.ok_(converters.to_utf8(self.utf8_spanish) == self.utf8_spanish)
+
+    def test_to_str(self):
+        tools.ok_(converters.to_str(self.u_japanese) == self.utf8_japanese)
+        tools.ok_(converters.to_str(self.utf8_spanish) == self.utf8_spanish)
+        tools.ok_(converters.to_str(object) == "<type 'object'>")
+
+    def test_non_string(self):
+        '''Test deprecated non_string parameter'''
+        # unicode
+        tools.assert_raises(TypeError, converters.to_unicode, *[5], **{'non_string': 'foo'})
+        tools.ok_(converters.to_unicode(5, non_string='empty') == u'')
+        tools.ok_(converters.to_unicode(5, non_string='passthru') == 5)
+        tools.ok_(converters.to_unicode(5, non_string='simplerepr') == u'5')
+        tools.ok_(converters.to_unicode(5, non_string='repr') == u'5')
+        tools.assert_raises(TypeError, converters.to_unicode, *[5], **{'non_string': 'strict'})
+
+        tools.ok_(converters.to_unicode(UnicodeNoStr(), non_string='simplerepr') == self.u_spanish)
+        tools.ok_(converters.to_unicode(StrNoUnicode(), non_string='simplerepr') == self.u_spanish)
+        tools.ok_(converters.to_unicode(StrReturnsUnicode(), non_string='simplerepr') == self.u_spanish)
+        tools.ok_(converters.to_unicode(UnicodeReturnsStr(), non_string='simplerepr') == self.u_spanish)
+        tools.ok_(converters.to_unicode(UnicodeStrCrossed(), non_string='simplerepr') == self.u_spanish)
+
+        obj_repr = converters.to_unicode(object, non_string='simplerepr')
+        tools.ok_(obj_repr == u"<type 'object'>" and isinstance(obj_repr, unicode))
+
+        # Bytes
+        tools.ok_(converters.to_bytes(5) == '5')
+        tools.ok_(converters.to_bytes(5, non_string='empty') == '')
+        tools.ok_(converters.to_bytes(5, non_string='passthru') == 5)
+        tools.ok_(converters.to_bytes(5, non_string='simplerepr') == '5')
+        tools.ok_(converters.to_bytes(5, non_string='repr') == '5')
+
+        # Raise a TypeError if the msg is non_string and we're set to strict
+        tools.assert_raises(TypeError, converters.to_bytes, *[5], **{'non_string': 'strict'})
+        # Raise a TypeError if given an invalid non_string arg
+        tools.assert_raises(TypeError, converters.to_bytes, *[5], **{'non_string': 'INVALID'})
+
+        # No __str__ method so this returns repr
+        string = converters.to_bytes(UnicodeNoStr(), non_string='simplerepr')
+        self._check_repr_bytes(string, 'UnicodeNoStr')
+
+        # This object's __str__ returns a utf8 encoded object
+        tools.ok_(converters.to_bytes(StrNoUnicode(), non_string='simplerepr') == self.utf8_spanish)
+
+        # This object's __str__ returns unicode which to_bytes converts to utf8
+        tools.ok_(converters.to_bytes(StrReturnsUnicode(), non_string='simplerepr') == self.utf8_spanish)
+        # Unless we explicitly ask for something different
+        tools.ok_(converters.to_bytes(StrReturnsUnicode(),
+            non_string='simplerepr', encoding='latin1') == self.latin1_spanish)
+
+        # This object has no __str__ so it returns repr
+        string = converters.to_bytes(UnicodeReturnsStr(), non_string='simplerepr')
+        self._check_repr_bytes(string, 'UnicodeReturnsStr')
+
+        # This object's __str__ returns unicode which to_bytes converts to utf8
+        tools.ok_(converters.to_bytes(UnicodeStrCrossed(), non_string='simplerepr') == self.utf8_spanish)
+
+        # This object's __repr__ returns unicode which to_bytes converts to utf8
+        tools.ok_(converters.to_bytes(ReprUnicode(), non_string='simplerepr')
+            == u'ReprUnicode(El veloz murciélago saltó sobre el perro perezoso.)'.encode('utf8'))
+        tools.ok_(converters.to_bytes(ReprUnicode(), non_string='repr') ==
+            u'ReprUnicode(El veloz murciélago saltó sobre el perro perezoso.)'.encode('utf8'))
+
+        obj_repr = converters.to_bytes(object, non_string='simplerepr')
+        tools.ok_(obj_repr == "<type 'object'>" and isinstance(obj_repr, str))
diff --git a/tests/test_defaultdict.py b/tests/test_defaultdict.py
new file mode 100644
index 0000000..12f5d57
--- /dev/null
+++ b/tests/test_defaultdict.py
@@ -0,0 +1,180 @@
+"""Unit tests for collections.defaultdict."""
+
+import os
+import copy
+import tempfile
+import unittest
+from test import test_support
+
+from kitchen.pycompat25.collections._defaultdict import defaultdict
+
+def foobar():
+    return list
+
+class TestDefaultDict(unittest.TestCase):
+
+    def test_basic(self):
+        d1 = defaultdict()
+        self.assertEqual(d1.default_factory, None)
+        d1.default_factory = list
+        d1[12].append(42)
+        self.assertEqual(d1, {12: [42]})
+        d1[12].append(24)
+        self.assertEqual(d1, {12: [42, 24]})
+        d1[13]
+        d1[14]
+        self.assertEqual(d1, {12: [42, 24], 13: [], 14: []})
+        self.assert_(d1[12] is not d1[13] is not d1[14])
+        d2 = defaultdict(list, foo=1, bar=2)
+        self.assertEqual(d2.default_factory, list)
+        self.assertEqual(d2, {"foo": 1, "bar": 2})
+        self.assertEqual(d2["foo"], 1)
+        self.assertEqual(d2["bar"], 2)
+        self.assertEqual(d2[42], [])
+        self.assert_("foo" in d2)
+        self.assert_("foo" in d2.keys())
+        self.assert_("bar" in d2)
+        self.assert_("bar" in d2.keys())
+        self.assert_(42 in d2)
+        self.assert_(42 in d2.keys())
+        self.assert_(12 not in d2)
+        self.assert_(12 not in d2.keys())
+        d2.default_factory = None
+        self.assertEqual(d2.default_factory, None)
+        try:
+            d2[15]
+        except KeyError, err:
+            self.assertEqual(err.args, (15,))
+        else:
+            self.fail("d2[15] didn't raise KeyError")
+        self.assertRaises(TypeError, defaultdict, 1)
+
+    def test_missing(self):
+        d1 = defaultdict()
+        self.assertRaises(KeyError, d1.__missing__, 42)
+        d1.default_factory = list
+        self.assertEqual(d1.__missing__(42), [])
+
+    def test_repr(self):
+        d1 = defaultdict()
+        self.assertEqual(d1.default_factory, None)
+        self.assertEqual(repr(d1), "defaultdict(None, {})")
+        self.assertEqual(eval(repr(d1)), d1)
+        d1[11] = 41
+        self.assertEqual(repr(d1), "defaultdict(None, {11: 41})")
+        d2 = defaultdict(int)
+        self.assertEqual(d2.default_factory, int)
+        d2[12] = 42
+        self.assertEqual(repr(d2), "defaultdict(<type 'int'>, {12: 42})")
+        def foo(): return 43
+        d3 = defaultdict(foo)
+
+        self.assert_(d3.default_factory is foo)
+        d3[13]
+        self.assertEqual(repr(d3), "defaultdict(%s, {13: 43})" % repr(foo))
+
+    def test_print(self):
+        d1 = defaultdict()
+        def foo(): return 42
+        d2 = defaultdict(foo, {1: 2})
+        # NOTE: We can't use tempfile.[Named]TemporaryFile since this
+        # code must exercise the tp_print C code, which only gets
+        # invoked for *real* files.
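+        # (Editorial note: tempfile.mktemp() only hands back a path; the file
+        # is opened separately below so that print writes to a real C-level
+        # FILE*, which is what exercises the tp_print slot.)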
+        tfn = tempfile.mktemp()
+        try:
+            f = open(tfn, "w+")
+            try:
+                print >>f, d1
+                print >>f, d2
+                f.seek(0)
+                self.assertEqual(f.readline(), repr(d1) + "\n")
+                self.assertEqual(f.readline(), repr(d2) + "\n")
+            finally:
+                f.close()
+        finally:
+            os.remove(tfn)
+
+    def test_copy(self):
+        d1 = defaultdict()
+        d2 = d1.copy()
+        self.assertEqual(type(d2), defaultdict)
+        self.assertEqual(d2.default_factory, None)
+        self.assertEqual(d2, {})
+        d1.default_factory = list
+        d3 = d1.copy()
+        self.assertEqual(type(d3), defaultdict)
+        self.assertEqual(d3.default_factory, list)
+        self.assertEqual(d3, {})
+        d1[42]
+        d4 = d1.copy()
+        self.assertEqual(type(d4), defaultdict)
+        self.assertEqual(d4.default_factory, list)
+        self.assertEqual(d4, {42: []})
+        d4[12]
+        self.assertEqual(d4, {42: [], 12: []})
+
+        # Issue 6637: Copy fails for empty default dict
+        d = defaultdict()
+        d['a'] = 42
+        e = d.copy()
+        self.assertEqual(e['a'], 42)
+
+    def test_shallow_copy(self):
+        d1 = defaultdict(foobar, {1: 1})
+        d2 = copy.copy(d1)
+        self.assertEqual(d2.default_factory, foobar)
+        self.assertEqual(d2, d1)
+        d1.default_factory = list
+        d2 = copy.copy(d1)
+        self.assertEqual(d2.default_factory, list)
+        self.assertEqual(d2, d1)
+
+    def test_deep_copy(self):
+        d1 = defaultdict(foobar, {1: [1]})
+        d2 = copy.deepcopy(d1)
+        self.assertEqual(d2.default_factory, foobar)
+        self.assertEqual(d2, d1)
+        self.assert_(d1[1] is not d2[1])
+        d1.default_factory = list
+        d2 = copy.deepcopy(d1)
+        self.assertEqual(d2.default_factory, list)
+        self.assertEqual(d2, d1)
+
+    def test_keyerror_without_factory(self):
+        d1 = defaultdict()
+        try:
+            d1[(1,)]
+        except KeyError, err:
+            self.assertEqual(err.args[0], (1,))
+        else:
+            self.fail("expected KeyError")
+
+    def test_recursive_repr(self):
+        # Issue2045: stack overflow when default_factory is a bound method
+        class sub(defaultdict):
+            def __init__(self):
+                self.default_factory = self._factory
+            def _factory(self):
+                return []
+        d = sub()
+        self.assert_(repr(d).startswith(
+            "defaultdict(<bound method sub._factory of defaultdict(..."))
+
+        # NOTE: printing a subclass of a builtin type does not call its
+        # tp_print slot. So this part is essentially the same test as above.
+        tfn = tempfile.mktemp()
+        try:
+            f = open(tfn, "w+")
+            try:
+                print >>f, d
+            finally:
+                f.close()
+        finally:
+            os.remove(tfn)
+
+
+#def test_main():
+#    test_support.run_unittest(TestDefaultDict)
+#
+#if __name__ == "__main__":
+#    test_main()
diff --git a/tests/test_deprecation.py b/tests/test_deprecation.py
new file mode 100644
index 0000000..4ca2b1b
--- /dev/null
+++ b/tests/test_deprecation.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+#
+import unittest
+from nose import tools
+
+import sys
+import warnings
+from kitchen.text import converters
+from kitchen.text import utf8
+
+class TestDeprecated(unittest.TestCase):
+    def setUp(self):
+        registry = sys._getframe(2).f_globals.get('__warningregistry__')
+        if registry:
+            registry.clear()
+        registry = sys._getframe(1).f_globals.get('__warningregistry__')
+        if registry:
+            registry.clear()
+        warnings.simplefilter('error', DeprecationWarning)
+
+    def tearDown(self):
+        warnings.simplefilter('default', DeprecationWarning)
+
+    def test_deprecated_functions(self):
+        '''Test that all deprecated functions raise DeprecationWarning'''
+        tools.assert_raises(DeprecationWarning, converters.to_utf8, u'café')
+        tools.assert_raises(DeprecationWarning, converters.to_str, 5)
+        tools.assert_raises(DeprecationWarning, converters.to_xml, 'test')
+
+        tools.assert_raises(DeprecationWarning, utf8.utf8_valid, 'test')
+        tools.assert_raises(DeprecationWarning, utf8.utf8_width, 'test')
+        tools.assert_raises(DeprecationWarning, utf8.utf8_width_chop, 'test')
+        tools.assert_raises(DeprecationWarning, utf8.utf8_width_fill, 'test', 'asd')
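+        # (Editorial note: these assert_raises calls only work because setUp
+        # installed warnings.simplefilter('error', DeprecationWarning), which
+        # promotes the modules' deprecation warnings into catchable
+        # exceptions.)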
tools.assert_raises(DeprecationWarning, utf8.utf8_text_wrap, 'test') + tools.assert_raises(DeprecationWarning, utf8.utf8_text_fill, 'test') + tools.assert_raises(DeprecationWarning, utf8._utf8_width_le, 'test') + + def test_deprecated_parameters(self): + tools.assert_raises(DeprecationWarning, converters.to_unicode, *[5], + **{'non_string': 'simplerepr'}) + tools.assert_raises(DeprecationWarning, converters.to_unicode, *[5], + **{'nonstring': 'simplerepr', 'non_string': 'simplerepr'}) + + tools.assert_raises(DeprecationWarning, converters.to_bytes, *[5], + **{'non_string': 'simplerepr'}) + tools.assert_raises(DeprecationWarning, converters.to_bytes, *[5], + **{'nonstring': 'simplerepr', 'non_string': 'simplerepr'}) diff --git a/tests/test_i18n.py b/tests/test_i18n.py new file mode 100644 index 0000000..62039ab --- /dev/null +++ b/tests/test_i18n.py @@ -0,0 +1,749 @@ +# -*- coding: utf-8 -*- +# +import unittest +from nose import tools + +import os +import types + +from kitchen import i18n + +import base_classes + +class TestI18N_UTF8(unittest.TestCase): + def setUp(self): + self.old_LC_ALL = os.environ.get('LC_ALL', None) + os.environ['LC_ALL'] = 'pt_BR.UTF8' + + def tearDown(self): + if self.old_LC_ALL: + os.environ['LC_ALL'] = self.old_LC_ALL + else: + del(os.environ['LC_ALL']) + + def test_easy_gettext_setup(self): + '''Test that the easy_gettext_setup function works + ''' + _, N_ = i18n.easy_gettext_setup('foo', localedirs= + ['%s/data/locale/' % os.path.dirname(__file__)]) + tools.ok_(isinstance(_, types.MethodType)) + tools.ok_(isinstance(N_, types.MethodType)) + tools.ok_(_.im_func.func_name == 'ugettext') + tools.ok_(N_.im_func.func_name == 'ungettext') + + tools.ok_(_('café') == u'café') + tools.ok_(_(u'café') == u'café') + tools.ok_(N_('café', 'cafés', 1) == u'café') + tools.ok_(N_('café', 'cafés', 2) == u'cafés') + tools.ok_(N_(u'café', u'cafés', 1) == u'café') + tools.ok_(N_(u'café', u'cafés', 2) == u'cafés') + + def test_easy_gettext_setup_non_unicode(self): + '''Test that the easy_gettext_setup function works + ''' + b_, bN_ = i18n.easy_gettext_setup('foo', localedirs= + ['%s/data/locale/' % os.path.dirname(__file__)], + use_unicode=False) + tools.ok_(isinstance(b_, types.MethodType)) + tools.ok_(isinstance(bN_, types.MethodType)) + tools.ok_(b_.im_func.func_name == 'lgettext') + tools.ok_(bN_.im_func.func_name == 'lngettext') + + tools.ok_(b_('café') == 'café') + tools.ok_(b_(u'café') == 'café') + tools.ok_(bN_('café', 'cafés', 1) == 'café') + tools.ok_(bN_('café', 'cafés', 2) == 'cafés') + tools.ok_(bN_(u'café', u'cafés', 1) == 'café') + tools.ok_(bN_(u'café', u'cafés', 2) == 'cafés') + + def test_get_translation_object(self): + '''Test that the get_translation_object function works + ''' + translations = i18n.get_translation_object('foo', ['%s/data/locale/' % os.path.dirname(__file__)]) + tools.ok_(translations.__class__==i18n.DummyTranslations) + tools.assert_raises(IOError, i18n.get_translation_object, 'foo', ['%s/data/locale/' % os.path.dirname(__file__)], fallback=False) + + translations = i18n.get_translation_object('test', ['%s/data/locale/' % os.path.dirname(__file__)]) + tools.ok_(translations.__class__==i18n.NewGNUTranslations) + + def test_get_translation_object_create_fallback(self): + '''Test get_translation_object creates fallbacks for additional catalogs''' + translations = i18n.get_translation_object('test', + ['%s/data/locale' % os.path.dirname(__file__), + '%s/data/locale-old' % os.path.dirname(__file__)]) + 
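+        # (Editorial note: with two localedirs, the first catalog found
+        # becomes the primary translation object and later matches are
+        # chained onto it; the _fallback checks below verify that chain.)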
tools.ok_(translations.__class__==i18n.NewGNUTranslations) + tools.ok_(translations._fallback.__class__==i18n.NewGNUTranslations) + + def test_get_translation_object_copy(self): + '''Test get_translation_object shallow copies the message catalog''' + translations = i18n.get_translation_object('test', + ['%s/data/locale' % os.path.dirname(__file__), + '%s/data/locale-old' % os.path.dirname(__file__)], codeset='utf-8') + translations.input_charset = 'utf-8' + translations2 = i18n.get_translation_object('test', + ['%s/data/locale' % os.path.dirname(__file__), + '%s/data/locale-old' % os.path.dirname(__file__)], codeset='latin-1') + translations2.input_charset = 'latin-1' + + # Test that portions of the translation objects are the same and other + # portions are different (which is a space optimization so that the + # translation data isn't in memory multiple times) + tools.ok_(id(translations._fallback) != id(translations2._fallback)) + tools.ok_(id(translations.output_charset()) != id(translations2.output_charset())) + tools.ok_(id(translations.input_charset) != id(translations2.input_charset)) + tools.ok_(id(translations.input_charset) != id(translations2.input_charset)) + tools.eq_(id(translations._catalog), id(translations2._catalog)) + + def test_get_translation_object_optional_params(self): + '''Smoketest leaving out optional parameters''' + translations = i18n.get_translation_object('test') + tools.ok_(translations.__class__ in (i18n.NewGNUTranslations, i18n.DummyTranslations)) + + def test_dummy_translation(self): + '''Test that we can create a DummyTranslation object + ''' + tools.ok_(isinstance(i18n.DummyTranslations(), i18n.DummyTranslations)) + +# Note: Using nose's generator tests for this so we can't subclass +# unittest.TestCase +class TestDummyTranslations(base_classes.UnicodeTestData): + def __init__(self): + self.test_data = {'bytes': (( # First set is with default charset (utf8) + (self.u_ascii, self.b_ascii), + (self.u_spanish, self.utf8_spanish), + (self.u_japanese, self.utf8_japanese), + (self.b_ascii, self.b_ascii), + (self.utf8_spanish, self.utf8_spanish), + (self.latin1_spanish, self.utf8_mangled_spanish_latin1_as_utf8), + (self.utf8_japanese, self.utf8_japanese), + ), + ( # Second set is with output_charset of latin1 (ISO-8859-1) + (self.u_ascii, self.b_ascii), + (self.u_spanish, self.latin1_spanish), + (self.u_japanese, self.latin1_mangled_japanese_replace_as_latin1), + (self.b_ascii, self.b_ascii), + (self.utf8_spanish, self.utf8_spanish), + (self.latin1_spanish, self.latin1_spanish), + (self.utf8_japanese, self.utf8_japanese), + ), + ( # Third set is with output_charset of C + (self.u_ascii, self.b_ascii), + (self.u_spanish, self.ascii_mangled_spanish_as_ascii), + (self.u_japanese, self.ascii_mangled_japanese_replace_as_latin1), + (self.b_ascii, self.b_ascii), + (self.utf8_spanish, self.ascii_mangled_spanish_as_ascii), + (self.latin1_spanish, self.ascii_twice_mangled_spanish_latin1_as_utf8_as_ascii), + (self.utf8_japanese, self.ascii_mangled_japanese_replace_as_latin1), + ), + ), + 'unicode': (( # First set is with the default charset (utf8) + (self.u_ascii, self.u_ascii), + (self.u_spanish, self.u_spanish), + (self.u_japanese, self.u_japanese), + (self.b_ascii, self.u_ascii), + (self.utf8_spanish, self.u_spanish), + (self.latin1_spanish, self.u_mangled_spanish_latin1_as_utf8), # String is mangled but no exception + (self.utf8_japanese, self.u_japanese), + ), + ( # Second set is with _charset of latin1 (ISO-8859-1) + (self.u_ascii, self.u_ascii), + 
(self.u_spanish, self.u_spanish), + (self.u_japanese, self.u_japanese), + (self.b_ascii, self.u_ascii), + (self.utf8_spanish, self.u_mangled_spanish_utf8_as_latin1), # String mangled but no exception + (self.latin1_spanish, self.u_spanish), + (self.utf8_japanese, self.u_mangled_japanese_utf8_as_latin1), # String mangled but no exception + ), + ( # Third set is with _charset of C + (self.u_ascii, self.u_ascii), + (self.u_spanish, self.u_spanish), + (self.u_japanese, self.u_japanese), + (self.b_ascii, self.u_ascii), + (self.utf8_spanish, self.u_mangled_spanish_utf8_as_ascii), # String mangled but no exception + (self.latin1_spanish, self.u_mangled_spanish_latin1_as_ascii), # String mangled but no exception + (self.utf8_japanese, self.u_mangled_japanese_utf8_as_ascii), # String mangled but no exception + ), + ) + } + + def setUp(self): + self.translations = i18n.DummyTranslations() + + def check_gettext(self, message, value, charset=None): + self.translations.set_output_charset(charset) + tools.eq_(self.translations.gettext(message), value, + msg='gettext(%s): trans: %s != val: %s (charset=%s)' + % (repr(message), repr(self.translations.gettext(message)), + repr(value), charset)) + + def check_lgettext(self, message, value, charset=None, + locale='en_US.UTF-8'): + os.environ['LC_ALL'] = locale + self.translations.set_output_charset(charset) + tools.eq_(self.translations.lgettext(message), value, + msg='lgettext(%s): trans: %s != val: %s (charset=%s, locale=%s)' + % (repr(message), repr(self.translations.lgettext(message)), + repr(value), charset, locale)) + + # Note: charset has a default value because nose isn't invoking setUp and + # tearDown each time check_* is run. + def check_ugettext(self, message, value, charset='utf-8'): + '''ugettext method with default values''' + self.translations.input_charset = charset + tools.eq_(self.translations.ugettext(message), value, + msg='ugettext(%s): trans: %s != val: %s (charset=%s)' + % (repr(message), repr(self.translations.ugettext(message)), + repr(value), charset)) + + def check_ngettext(self, message, value, charset=None): + self.translations.set_output_charset(charset) + tools.eq_(self.translations.ngettext(message, 'blank', 1), value) + tools.eq_(self.translations.ngettext('blank', message, 2), value) + tools.ok_(self.translations.ngettext(message, 'blank', 2) != value) + tools.ok_(self.translations.ngettext('blank', message, 1) != value) + + def check_lngettext(self, message, value, charset=None, locale='en_US.UTF-8'): + os.environ['LC_ALL'] = locale + self.translations.set_output_charset(charset) + tools.eq_(self.translations.lngettext(message, 'blank', 1), value, + msg='lngettext(%s, "blank", 1): trans: %s != val: %s (charset=%s, locale=%s)' + % (repr(message), repr(self.translations.lngettext(message, + 'blank', 1)), repr(value), charset, locale)) + tools.eq_(self.translations.lngettext('blank', message, 2), value, + msg='lngettext("blank", %s, 2): trans: %s != val: %s (charset=%s, locale=%s)' + % (repr(message), repr(self.translations.lngettext('blank', + message, 2)), repr(value), charset, locale)) + tools.ok_(self.translations.lngettext(message, 'blank', 2) != value, + msg='lngettext(%s, "blank", 2): trans: %s != val: %s (charset=%s, locale=%s)' + % (repr(message), repr(self.translations.lngettext(message, + 'blank', 2)), repr(value), charset, locale)) + tools.ok_(self.translations.lngettext('blank', message, 1) != value, + msg='lngettext("blank", %s, 1): trans: %s != val: %s (charset=%s, locale=%s)' + % (repr(message), 
repr(self.translations.lngettext('blank', + message, 1)), repr(value), charset, locale)) + + # Note: charset has a default value because nose isn't invoking setUp and + # tearDown each time check_* is run. + def check_ungettext(self, message, value, charset='utf-8'): + self.translations.input_charset = charset + tools.eq_(self.translations.ungettext(message, 'blank', 1), value) + tools.eq_(self.translations.ungettext('blank', message, 2), value) + tools.ok_(self.translations.ungettext(message, 'blank', 2) != value) + tools.ok_(self.translations.ungettext('blank', message, 1) != value) + + def test_gettext(self): + '''gettext method with default values''' + for message, value in self.test_data['bytes'][0]: + yield self.check_gettext, message, value + + def test_gettext_output_charset(self): + '''gettext method after output_charset is set''' + for message, value in self.test_data['bytes'][1]: + yield self.check_gettext, message, value, 'latin1' + + def test_ngettext(self): + for message, value in self.test_data['bytes'][0]: + yield self.check_ngettext, message, value + + def test_ngettext_output_charset(self): + for message, value in self.test_data['bytes'][1]: + yield self.check_ngettext, message, value, 'latin1' + + def test_lgettext(self): + '''lgettext method with default values on a utf8 locale''' + for message, value in self.test_data['bytes'][0]: + yield self.check_lgettext, message, value + + def test_lgettext_output_charset(self): + '''lgettext method after output_charset is set''' + for message, value in self.test_data['bytes'][1]: + yield self.check_lgettext, message, value, 'latin1' + + def test_lgettext_output_charset_and_locale(self): + '''lgettext method after output_charset is set in C locale + + output_charset should take precedence + ''' + for message, value in self.test_data['bytes'][1]: + yield self.check_lgettext, message, value, 'latin1', 'C' + + def test_lgettext_locale_C(self): + '''lgettext method in a C locale''' + for message, value in self.test_data['bytes'][2]: + yield self.check_lgettext, message, value, None, 'C' + + def test_lngettext(self): + '''lngettext method with default values on a utf8 locale''' + for message, value in self.test_data['bytes'][0]: + yield self.check_lngettext, message, value + + def test_lngettext_output_charset(self): + '''lngettext method after output_charset is set''' + for message, value in self.test_data['bytes'][1]: + yield self.check_lngettext, message, value, 'latin1' + + def test_lngettext_output_charset_and_locale(self): + '''lngettext method after output_charset is set in C locale + + output_charset should take precedence + ''' + for message, value in self.test_data['bytes'][1]: + yield self.check_lngettext, message, value, 'latin1', 'C' + + def test_lngettext_locale_C(self): + '''lngettext method in a C locale''' + for message, value in self.test_data['bytes'][2]: + yield self.check_lngettext, message, value, None, 'C' + + def test_ugettext(self): + for message, value in self.test_data['unicode'][0]: + yield self.check_ugettext, message, value + + def test_ugettext_charset_latin1(self): + for message, value in self.test_data['unicode'][1]: + yield self.check_ugettext, message, value, 'latin1' + + def test_ugettext_charset_ascii(self): + for message, value in self.test_data['unicode'][2]: + yield self.check_ugettext, message, value, 'ascii' + + def test_ungettext(self): + for message, value in self.test_data['unicode'][0]: + yield self.check_ungettext, message, value + + def test_ungettext_charset_latin1(self): + for message, 
value in self.test_data['unicode'][1]: + yield self.check_ungettext, message, value, 'latin1' + + def test_ungettext_charset_ascii(self): + for message, value in self.test_data['unicode'][2]: + yield self.check_ungettext, message, value, 'ascii' + + def test_nonbasestring(self): + tools.eq_(self.translations.gettext(dict(hi='there')), '') + tools.eq_(self.translations.ngettext(dict(hi='there'), dict(hi='two'), 1), '') + tools.eq_(self.translations.lgettext(dict(hi='there')), '') + tools.eq_(self.translations.lngettext(dict(hi='there'), dict(hi='two'), 1), '') + tools.eq_(self.translations.ugettext(dict(hi='there')), u'') + tools.eq_(self.translations.ungettext(dict(hi='there'), dict(hi='two'), 1), u'') + + +class TestI18N_Latin1(unittest.TestCase): + def setUp(self): + self.old_LC_ALL = os.environ.get('LC_ALL', None) + os.environ['LC_ALL'] = 'pt_BR.ISO8859-1' + + def tearDown(self): + if self.old_LC_ALL: + os.environ['LC_ALL'] = self.old_LC_ALL + else: + del(os.environ['LC_ALL']) + + def test_easy_gettext_setup_non_unicode(self): + '''Test that the easy_gettext_setup function works + ''' + b_, bN_ = i18n.easy_gettext_setup('foo', localedirs= + ['%s/data/locale/' % os.path.dirname(__file__)], + use_unicode=False) + + tools.ok_(b_('café') == 'café') + tools.ok_(b_(u'café') == 'caf\xe9') + tools.ok_(bN_('café', 'cafés', 1) == 'café') + tools.ok_(bN_('café', 'cafés', 2) == 'cafés') + tools.ok_(bN_(u'café', u'cafés', 1) == 'caf\xe9') + tools.ok_(bN_(u'café', u'cafés', 2) == 'caf\xe9s') + + +class TestNewGNUTranslationsNoMatch(TestDummyTranslations): + def setUp(self): + self.old_LC_ALL = os.environ.get('LC_ALL', None) + os.environ['LC_ALL'] = 'pt_BR.utf8' + self.translations = i18n.get_translation_object('test', ['%s/data/locale/' % os.path.dirname(__file__)]) + + def tearDown(self): + if self.old_LC_ALL: + os.environ['LC_ALL'] = self.old_LC_ALL + else: + del(os.environ['LC_ALL']) + + +class TestNewGNURealTranslations_UTF8(unittest.TestCase): + def setUp(self): + self.old_LC_ALL = os.environ.get('LC_ALL', None) + os.environ['LC_ALL'] = 'pt_BR.UTF8' + self.translations = i18n.get_translation_object('test', ['%s/data/locale/' % os.path.dirname(__file__)]) + + def tearDown(self): + if self.old_LC_ALL: + os.environ['LC_ALL'] = self.old_LC_ALL + else: + del(os.environ['LC_ALL']) + + def test_gettext(self): + _ = self.translations.gettext + tools.ok_(_('kitchen sink')=='pia da cozinha') + tools.ok_(_('Kuratomi')=='くらとみ') + tools.ok_(_('くらとみ')=='Kuratomi') + tools.ok_(_('Only café in fallback')=='Only café in fallback') + + tools.ok_(_(u'kitchen sink')=='pia da cozinha') + tools.ok_(_(u'くらとみ')=='Kuratomi') + tools.ok_(_(u'Kuratomi')=='くらとみ') + tools.ok_(_(u'Only café in fallback')=='Only café in fallback') + + def test_ngettext(self): + _ = self.translations.ngettext + tools.ok_(_('1 lemon', '4 lemons', 1)=='一 limão') + tools.ok_(_('一 limão', '四 limões', 1)=='1 lemon') + tools.ok_(_(u'1 lemon', u'4 lemons', 1)=='一 limão') + tools.ok_(_(u'一 limão', u'四 limões', 1)=='1 lemon') + + tools.ok_(_('1 lemon', '4 lemons', 2)=='四 limões') + tools.ok_(_('一 limão', '四 limões', 2)=='4 lemons') + tools.ok_(_(u'1 lemon', u'4 lemons', 2)=='四 limões') + tools.ok_(_(u'一 limão', u'四 limões', 2)=='4 lemons') + + def test_lgettext(self): + _ = self.translations.lgettext + tools.ok_(_('kitchen sink')=='pia da cozinha') + tools.ok_(_('Kuratomi')=='くらとみ') + tools.ok_(_('くらとみ')=='Kuratomi') + tools.ok_(_('Only café in fallback')=='Only café in fallback') + + tools.ok_(_(u'kitchen sink')=='pia da cozinha') + 
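+        # (Editorial note: lgettext returns byte strings encoded for the
+        # current locale -- pt_BR.UTF8 in this setUp -- so the utf-8 literals
+        # below compare equal without setting an output_charset.)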
tools.ok_(_(u'くらとみ')=='Kuratomi') + tools.ok_(_(u'Kuratomi')=='くらとみ') + tools.ok_(_(u'Only café in fallback')=='Only café in fallback') + + def test_lngettext(self): + _ = self.translations.lngettext + tools.ok_(_('1 lemon', '4 lemons', 1)=='一 limão') + tools.ok_(_('一 limão', '四 limões', 1)=='1 lemon') + tools.ok_(_(u'1 lemon', u'4 lemons', 1)=='一 limão') + tools.ok_(_(u'一 limão', u'四 limões', 1)=='1 lemon') + + tools.ok_(_('1 lemon', '4 lemons', 2)=='四 limões') + tools.ok_(_('一 limão', '四 limões', 2)=='4 lemons') + tools.ok_(_(u'1 lemon', u'4 lemons', 2)=='四 limões') + tools.ok_(_(u'一 limão', u'四 limões', 2)=='4 lemons') + + def test_ugettext(self): + _ = self.translations.ugettext + tools.ok_(_('kitchen sink')==u'pia da cozinha') + tools.ok_(_('Kuratomi')==u'くらとみ') + tools.ok_(_('くらとみ')==u'Kuratomi') + tools.ok_(_('Only café in fallback')==u'Only café in fallback') + + tools.ok_(_(u'kitchen sink')==u'pia da cozinha') + tools.ok_(_(u'くらとみ')==u'Kuratomi') + tools.ok_(_(u'Kuratomi')==u'くらとみ') + tools.ok_(_(u'Only café in fallback')==u'Only café in fallback') + + def test_ungettext(self): + _ = self.translations.ungettext + tools.ok_(_('1 lemon', '4 lemons', 1)==u'一 limão') + tools.ok_(_('一 limão', '四 limões', 1)==u'1 lemon') + tools.ok_(_(u'1 lemon', u'4 lemons', 1)==u'一 limão') + tools.ok_(_(u'一 limão', u'四 limões', 1)==u'1 lemon') + + tools.ok_(_('1 lemon', '4 lemons', 2)==u'四 limões') + tools.ok_(_('一 limão', '四 limões', 2)==u'4 lemons') + tools.ok_(_(u'1 lemon', u'4 lemons', 2)==u'四 limões') + tools.ok_(_(u'一 limão', u'四 limões', 2)==u'4 lemons') + + +class TestNewGNURealTranslations_Latin1(TestNewGNURealTranslations_UTF8): + def setUp(self): + self.old_LC_ALL = os.environ.get('LC_ALL', None) + os.environ['LC_ALL'] = 'pt_BR.ISO8859-1' + self.translations = i18n.get_translation_object('test', ['%s/data/locale/' % os.path.dirname(__file__)]) + + def tearDown(self): + if self.old_LC_ALL: + os.environ['LC_ALL'] = self.old_LC_ALL + else: + del(os.environ['LC_ALL']) + + def test_lgettext(self): + _ = self.translations.lgettext + tools.eq_(_('kitchen sink'), 'pia da cozinha') + tools.eq_(_('Kuratomi'), '????') + tools.eq_(_('くらとみ'), 'Kuratomi') + # The following returns utf-8 because latin-1 can hold all of the + # bytes that are present in utf-8 encodings. Therefore, we cannot + # tell that we should reencode the string. This will be displayed as + # mangled text if used in a program + tools.eq_(_('Only café in fallback'), 'Only caf\xc3\xa9 in fallback') + + tools.eq_(_(u'kitchen sink'), 'pia da cozinha') + tools.eq_(_(u'くらとみ'), 'Kuratomi') + tools.eq_(_(u'Kuratomi'), '????') + tools.eq_(_(u'Only café in fallback'), 'Only caf\xe9 in fallback') + + def test_lngettext(self): + _ = self.translations.lngettext + tools.ok_(_('1 lemon', '4 lemons', 1)=='? lim\xe3o') + tools.ok_(_('一 limão', '四 limões', 1)=='1 lemon') + tools.ok_(_(u'1 lemon', u'4 lemons', 1)=='? lim\xe3o') + tools.ok_(_(u'一 limão', u'四 limões', 1)=='1 lemon') + + tools.ok_(_('1 lemon', '4 lemons', 2)=='? lim\xf5es') + tools.ok_(_('一 limão', '四 limões', 2)=='4 lemons') + tools.ok_(_(u'1 lemon', u'4 lemons', 2)=='? 
lim\xf5es') + tools.ok_(_(u'一 limão', u'四 limões', 2)=='4 lemons') + + +class TestFallbackNewGNUTranslationsNoMatch(TestDummyTranslations): + def setUp(self): + self.old_LC_ALL = os.environ.get('LC_ALL', None) + os.environ['LC_ALL'] = 'pt_BR.utf8' + self.translations = i18n.get_translation_object('test', + ['%s/data/locale/' % os.path.dirname(__file__), + '%s/data/locale-old' % os.path.dirname(__file__)]) + + def tearDown(self): + if self.old_LC_ALL: + os.environ['LC_ALL'] = self.old_LC_ALL + else: + del(os.environ['LC_ALL']) + + +class TestFallbackNewGNURealTranslations_UTF8(unittest.TestCase): + def setUp(self): + self.old_LC_ALL = os.environ.get('LC_ALL', None) + os.environ['LC_ALL'] = 'pt_BR.UTF8' + self.translations = i18n.get_translation_object('test', + ['%s/data/locale/' % os.path.dirname(__file__), + '%s/data/locale-old' % os.path.dirname(__file__)]) + + def tearDown(self): + if self.old_LC_ALL: + os.environ['LC_ALL'] = self.old_LC_ALL + else: + del(os.environ['LC_ALL']) + + def test_gettext(self): + _ = self.translations.gettext + tools.ok_(_('kitchen sink')=='pia da cozinha') + tools.ok_(_('Kuratomi')=='くらとみ') + tools.ok_(_('くらとみ')=='Kuratomi') + tools.ok_(_(u'Only café in fallback')=='Yes, only caf\xc3\xa9 in fallback') + + tools.ok_(_(u'kitchen sink')=='pia da cozinha') + tools.ok_(_(u'くらとみ')=='Kuratomi') + tools.ok_(_(u'Kuratomi')=='くらとみ') + tools.ok_(_(u'Only café in fallback')=='Yes, only caf\xc3\xa9 in fallback') + + def test_ngettext(self): + _ = self.translations.ngettext + tools.ok_(_('1 lemon', '4 lemons', 1)=='一 limão') + tools.ok_(_('一 limão', '四 limões', 1)=='1 lemon') + tools.ok_(_(u'1 lemon', u'4 lemons', 1)=='一 limão') + tools.ok_(_(u'一 limão', u'四 limões', 1)=='1 lemon') + + tools.ok_(_('1 lemon', '4 lemons', 2)=='四 limões') + tools.ok_(_('一 limão', '四 limões', 2)=='4 lemons') + tools.ok_(_(u'1 lemon', u'4 lemons', 2)=='四 limões') + tools.ok_(_(u'一 limão', u'四 limões', 2)=='4 lemons') + + def test_lgettext(self): + _ = self.translations.lgettext + tools.eq_(_('kitchen sink'), 'pia da cozinha') + tools.ok_(_('Kuratomi')=='くらとみ') + tools.ok_(_('くらとみ')=='Kuratomi') + tools.ok_(_('Only café in fallback')=='Yes, only caf\xc3\xa9 in fallback') + + tools.ok_(_(u'kitchen sink')=='pia da cozinha') + tools.ok_(_(u'くらとみ')=='Kuratomi') + tools.ok_(_(u'Kuratomi')=='くらとみ') + tools.ok_(_(u'Only café in fallback')=='Yes, only caf\xc3\xa9 in fallback') + + def test_lngettext(self): + _ = self.translations.lngettext + tools.ok_(_('1 lemon', '4 lemons', 1)=='一 limão') + tools.ok_(_('一 limão', '四 limões', 1)=='1 lemon') + tools.ok_(_(u'1 lemon', u'4 lemons', 1)=='一 limão') + tools.ok_(_(u'一 limão', u'四 limões', 1)=='1 lemon') + + tools.ok_(_('1 lemon', '4 lemons', 2)=='四 limões') + tools.ok_(_('一 limão', '四 limões', 2)=='4 lemons') + tools.ok_(_(u'1 lemon', u'4 lemons', 2)=='四 limões') + tools.ok_(_(u'一 limão', u'四 limões', 2)=='4 lemons') + + def test_ugettext(self): + _ = self.translations.ugettext + tools.ok_(_('kitchen sink')==u'pia da cozinha') + tools.ok_(_('Kuratomi')==u'くらとみ') + tools.ok_(_('くらとみ')==u'Kuratomi') + tools.ok_(_('Only café in fallback')==u'Yes, only caf\xe9 in fallback') + + tools.ok_(_(u'kitchen sink')==u'pia da cozinha') + tools.ok_(_(u'くらとみ')==u'Kuratomi') + tools.ok_(_(u'Kuratomi')==u'くらとみ') + tools.ok_(_(u'Only café in fallback')==u'Yes, only caf\xe9 in fallback') + + def test_ungettext(self): + _ = self.translations.ungettext + tools.ok_(_('1 lemon', '4 lemons', 1)==u'一 limão') + tools.ok_(_('一 limão', '四 limões', 1)==u'1 lemon') + tools.ok_(_(u'1 
lemon', u'4 lemons', 1)==u'一 limão') + tools.ok_(_(u'一 limão', u'四 limões', 1)==u'1 lemon') + + tools.ok_(_('1 lemon', '4 lemons', 2)==u'四 limões') + tools.ok_(_('一 limão', '四 limões', 2)==u'4 lemons') + tools.ok_(_(u'1 lemon', u'4 lemons', 2)==u'四 limões') + tools.ok_(_(u'一 limão', u'四 limões', 2)==u'4 lemons') + + +class TestFallbackNewGNURealTranslations_Latin1(unittest.TestCase): + def setUp(self): + self.old_LC_ALL = os.environ.get('LC_ALL', None) + os.environ['LC_ALL'] = 'pt_BR.ISO8859-1' + self.translations = i18n.get_translation_object('test', + ['%s/data/locale/' % os.path.dirname(__file__), + '%s/data/locale-old' % os.path.dirname(__file__)]) + + def tearDown(self): + if self.old_LC_ALL: + os.environ['LC_ALL'] = self.old_LC_ALL + else: + del(os.environ['LC_ALL']) + + def test_gettext(self): + _ = self.translations.gettext + tools.ok_(_('kitchen sink')=='pia da cozinha') + tools.ok_(_('Kuratomi')=='くらとみ') + tools.ok_(_('くらとみ')=='Kuratomi') + tools.ok_(_(u'Only café in fallback')=='Yes, only caf\xc3\xa9 in fallback') + + tools.ok_(_(u'kitchen sink')=='pia da cozinha') + tools.ok_(_(u'くらとみ')=='Kuratomi') + tools.ok_(_(u'Kuratomi')=='くらとみ') + tools.ok_(_(u'Only café in fallback')=='Yes, only caf\xc3\xa9 in fallback') + + def test_ngettext(self): + _ = self.translations.ngettext + tools.ok_(_('1 lemon', '4 lemons', 1)=='一 limão') + tools.ok_(_('一 limão', '四 limões', 1)=='1 lemon') + tools.ok_(_(u'1 lemon', u'4 lemons', 1)=='一 limão') + tools.ok_(_(u'一 limão', u'四 limões', 1)=='1 lemon') + + tools.ok_(_('1 lemon', '4 lemons', 2)=='四 limões') + tools.ok_(_('一 limão', '四 limões', 2)=='4 lemons') + tools.ok_(_(u'1 lemon', u'4 lemons', 2)=='四 limões') + tools.ok_(_(u'一 limão', u'四 limões', 2)=='4 lemons') + + def test_lgettext(self): + _ = self.translations.lgettext + tools.eq_(_('kitchen sink'), 'pia da cozinha') + tools.eq_(_('Kuratomi'), '????') + tools.eq_(_('くらとみ'), 'Kuratomi') + tools.eq_(_('Only café in fallback'), 'Yes, only caf\xe9 in fallback') + + tools.eq_(_(u'kitchen sink'), 'pia da cozinha') + tools.eq_(_(u'くらとみ'), 'Kuratomi') + tools.eq_(_(u'Kuratomi'), '????') + tools.eq_(_(u'Only café in fallback'), 'Yes, only caf\xe9 in fallback') + + def test_lngettext(self): + _ = self.translations.lngettext + tools.eq_(_('1 lemon', '4 lemons', 1), u'一 limão'.encode('latin1', 'replace')) + tools.eq_(_('一 limão', '四 limões', 1), '1 lemon') + tools.eq_(_(u'1 lemon', u'4 lemons', 1), u'一 limão'.encode('latin1', 'replace')) + tools.eq_(_(u'一 limão', u'四 limões', 1), '1 lemon') + + tools.eq_(_('1 lemon', '4 lemons', 2), u'四 limões'.encode('latin1', 'replace')) + tools.eq_(_('一 limão', '四 limões', 2), '4 lemons') + tools.eq_(_(u'1 lemon', u'4 lemons', 2), u'四 limões'.encode('latin1', 'replace')) + tools.eq_(_(u'一 limão', u'四 limões', 2), '4 lemons') + + def test_ugettext(self): + _ = self.translations.ugettext + tools.ok_(_('kitchen sink')==u'pia da cozinha') + tools.ok_(_('Kuratomi')==u'くらとみ') + tools.ok_(_('くらとみ')==u'Kuratomi') + tools.ok_(_('Only café in fallback')==u'Yes, only caf\xe9 in fallback') + + tools.ok_(_(u'kitchen sink')==u'pia da cozinha') + tools.ok_(_(u'くらとみ')==u'Kuratomi') + tools.ok_(_(u'Kuratomi')==u'くらとみ') + tools.ok_(_(u'Only café in fallback')==u'Yes, only caf\xe9 in fallback') + + def test_ungettext(self): + _ = self.translations.ungettext + tools.ok_(_('1 lemon', '4 lemons', 1)==u'一 limão') + tools.ok_(_('一 limão', '四 limões', 1)==u'1 lemon') + tools.ok_(_(u'1 lemon', u'4 lemons', 1)==u'一 limão') + tools.ok_(_(u'一 limão', u'四 limões', 1)==u'1 lemon') + + 
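+        # (Editorial note: unlike lngettext above, ungettext always returns
+        # unicode, so the ISO8859-1 locale from setUp cannot mangle these.)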
tools.ok_(_('1 lemon', '4 lemons', 2)==u'四 limões') + tools.ok_(_('一 limão', '四 limões', 2)==u'4 lemons') + tools.ok_(_(u'1 lemon', u'4 lemons', 2)==u'四 limões') + tools.ok_(_(u'一 limão', u'四 limões', 2)==u'4 lemons') + + +class TestFallback(unittest.TestCase): + def setUp(self): + self.old_LC_ALL = os.environ.get('LC_ALL', None) + os.environ['LC_ALL'] = 'pt_BR.ISO8859-1' + self.gtranslations = i18n.get_translation_object('test', + ['%s/data/locale/' % os.path.dirname(__file__), + '%s/data/locale-old' % os.path.dirname(__file__)]) + self.gtranslations.add_fallback(object()) + self.dtranslations = i18n.get_translation_object('nonexistent', + ['%s/data/locale/' % os.path.dirname(__file__), + '%s/data/locale-old' % os.path.dirname(__file__)]) + self.dtranslations.add_fallback(object()) + + + def tearDown(self): + if self.old_LC_ALL: + os.environ['LC_ALL'] = self.old_LC_ALL + else: + del(os.environ['LC_ALL']) + + def test_invalid_fallback_no_raise(self): + '''Test when we have an invalid fallback that it does not raise.''' + tools.eq_(self.gtranslations.gettext('abc'), 'abc') + tools.eq_(self.gtranslations.ugettext('abc'), 'abc') + tools.eq_(self.gtranslations.lgettext('abc'), 'abc') + tools.eq_(self.dtranslations.gettext('abc'), 'abc') + tools.eq_(self.dtranslations.ugettext('abc'), 'abc') + tools.eq_(self.dtranslations.lgettext('abc'), 'abc') + + tools.eq_(self.dtranslations.ngettext('abc', 'cde', 1), 'abc') + tools.eq_(self.dtranslations.ungettext('abc', 'cde', 1), 'abc') + tools.eq_(self.dtranslations.lngettext('abc', 'cde', 1), 'abc') + tools.eq_(self.gtranslations.ngettext('abc', 'cde', 1), 'abc') + tools.eq_(self.gtranslations.ungettext('abc', 'cde', 1), 'abc') + tools.eq_(self.gtranslations.lngettext('abc', 'cde', 1), 'abc') + +class TestDefaultLocaleDir(unittest.TestCase): + def setUp(self): + self.old_LC_ALL = os.environ.get('LC_ALL', None) + os.environ['LC_ALL'] = 'pt_BR.UTF8' + self.old_DEFAULT_LOCALEDIRS = i18n._DEFAULT_LOCALEDIR + i18n._DEFAULT_LOCALEDIR = '%s/data/locale/' % os.path.dirname(__file__) + self.translations = i18n.get_translation_object('test') + + def tearDown(self): + if self.old_LC_ALL: + os.environ['LC_ALL'] = self.old_LC_ALL + else: + del(os.environ['LC_ALL']) + if self.old_DEFAULT_LOCALEDIRS: + i18n._DEFAULT_LOCALEDIR = self.old_DEFAULT_LOCALEDIRS + + def test_gettext(self): + _ = self.translations.gettext + tools.eq_(_('kitchen sink'), 'pia da cozinha') + tools.eq_(_('Kuratomi'), 'くらとみ') + tools.eq_(_('くらとみ'), 'Kuratomi') + tools.eq_(_('Only café in fallback'), 'Only café in fallback') + + tools.eq_(_(u'kitchen sink'), 'pia da cozinha') + tools.eq_(_(u'くらとみ'), 'Kuratomi') + tools.eq_(_(u'Kuratomi'), 'くらとみ') + tools.eq_(_(u'Only café in fallback'), 'Only café in fallback') + + diff --git a/tests/test_iterutils.py b/tests/test_iterutils.py new file mode 100644 index 0000000..dde67be --- /dev/null +++ b/tests/test_iterutils.py @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- +# +import unittest +from nose import tools + +from kitchen import iterutils + +class TestStrictDict(unittest.TestCase): + iterable_data = ( + [0, 1, 2], + [], + (0, 1, 2), + tuple(), + set([0, 1, 2]), + set(), + dict(a=1, b=2), + dict(), + [None], + [False], + [0], + xrange(0, 3), + iter([1, 2, 3]), + ) + non_iterable_data = ( + None, + False, + True, + 0, + 1.1, + ) + + def test_isiterable(self): + for item in self.iterable_data: + tools.ok_(iterutils.isiterable(item) == True) + + for item in self.non_iterable_data: + tools.ok_(iterutils.isiterable(item) == False) + + # strings + 
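+        # (Editorial note: strings are iterable in Python but are usually
+        # wanted as scalar values; include_string=True opts back in to
+        # treating them as character sequences, as the next asserts show.)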
tools.ok_(iterutils.isiterable('a', include_string=True) == True) + tools.ok_(iterutils.isiterable('a', include_string=False) == False) + tools.ok_(iterutils.isiterable('a') == False) + + def test_iterate(self): + iterutils.iterate(None) + for item in self.non_iterable_data: + tools.ok_(list(iterutils.iterate(item)) == [item]) + + for item in self.iterable_data[:-1]: + tools.ok_(list(iterutils.iterate(item)) == list(item)) + + # iter() is exhausted after use so we have to test separately + tools.ok_(list(iterutils.iterate(iter([1, 2, 3]))) == [1, 2, 3]) + + # strings + tools.ok_(list(iterutils.iterate('abc')) == ['abc']) + tools.ok_(list(iterutils.iterate('abc', include_string=True)) == ['a', 'b', 'c']) diff --git a/tests/test_pycompat.py b/tests/test_pycompat.py new file mode 100644 index 0000000..50a059b --- /dev/null +++ b/tests/test_pycompat.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- +# +import unittest +from nose import tools + +class TestUsableModules(unittest.TestCase): + def test_subprocess(self): + '''Test that importing subprocess as a module works + ''' + try: + from kitchen.pycompat24.subprocess import Popen + except ImportError: + tools.ok_(False, 'Unable to import pycompat24.subprocess as a module') + try: + from kitchen.pycompat27.subprocess import Popen + except ImportError: + tools.ok_(False, 'Unable to import pycompat27.subprocess as a module') + + def test_base64(self): + '''Test that importing base64 as a module works + ''' + try: + from kitchen.pycompat24.base64 import b64encode + except ImportError: + tools.ok_(False, 'Unable to import pycompat24.base64 as a module') diff --git a/tests/test_pycompat24.py b/tests/test_pycompat24.py new file mode 100644 index 0000000..adea7fe --- /dev/null +++ b/tests/test_pycompat24.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- +# +import unittest +from nose import tools +from nose.plugins.skip import SkipTest + +import __builtin__ +import base64 as py_b64 +import warnings + +from kitchen.pycompat24 import sets +from kitchen.pycompat24.base64 import _base64 as base64 + +class TestSetsNoOverwrite(unittest.TestCase): + def setUp(self): + self.set_val = None + self.frozenset_val = None + if not hasattr(__builtin__, 'set'): + __builtin__.set = self.set_val + else: + self.set_val = __builtin__.set + if not hasattr(__builtin__, 'frozenset'): + __builtin__.frozenset = self.frozenset_val + else: + self.frozenset_val = __builtin__.frozenset + + def tearDown(self): + if self.frozenset_val == None: + del(__builtin__.frozenset) + if self.set_val == None: + del(__builtin__.set) + + def test_sets_dont_overwrite(self): + '''Test that importing sets when there's already a set and frozenset defined does not overwrite + ''' + sets.add_builtin_set() + tools.ok_(__builtin__.set == self.set_val) + tools.ok_(__builtin__.frozenset == self.frozenset_val) + +class TestDefineSets(unittest.TestCase): + def setUp(self): + warnings.simplefilter('ignore', DeprecationWarning) + self.set_val = None + self.frozenset_val = None + if hasattr(__builtin__, 'set'): + self.set_val = __builtin__.set + del(__builtin__.set) + if hasattr(__builtin__, 'frozenset'): + self.frozenset_val = __builtin__.frozenset + del(__builtin__.frozenset) + + def tearDown(self): + warnings.simplefilter('default', DeprecationWarning) + if self.set_val: + __builtin__.set = self.set_val + else: + del(__builtin__.set) + if self.frozenset_val: + __builtin__.frozenset = self.frozenset_val + else: + del(__builtin__.frozenset) + + def test_pycompat_defines_set(self): + '''Test that calling 
pycompat24.add_builtin_set() adds set and frozenset to __builtin__ + ''' + import sets as py_sets + sets.add_builtin_set() + if self.set_val: + tools.ok_(__builtin__.set == self.set_val) + tools.ok_(__builtin__.frozenset == self.frozenset_val) + else: + tools.ok_(__builtin__.set == py_sets.Set) + tools.ok_(__builtin__.frozenset == py_sets.ImmutableSet) + +class TestSubprocess(unittest.TestCase): + pass + +class TestBase64(unittest.TestCase): + b_byte_chars = ' '.join(map(chr, range(0, 256))) + b_byte_encoded = 'ACABIAIgAyAEIAUgBiAHIAggCSAKIAsgDCANIA4gDyAQIBEgEiATIBQgFSAWIBcgGCAZIBogGyAcIB0gHiAfICAgISAiICMgJCAlICYgJyAoICkgKiArICwgLSAuIC8gMCAxIDIgMyA0IDUgNiA3IDggOSA6IDsgPCA9ID4gPyBAIEEgQiBDIEQgRSBGIEcgSCBJIEogSyBMIE0gTiBPIFAgUSBSIFMgVCBVIFYgVyBYIFkgWiBbIFwgXSBeIF8gYCBhIGIgYyBkIGUgZiBnIGggaSBqIGsgbCBtIG4gbyBwIHEgciBzIHQgdSB2IHcgeCB5IHogeyB8IH0gfiB/IIAggSCCIIMghCCFIIYghyCIIIkgiiCLIIwgjSCOII8gkCCRIJIgkyCUIJUgliCXIJggmSCaIJsgnCCdIJ4gnyCgIKEgoiCjIKQgpSCmIKcgqCCpIKogqyCsIK0griCvILAgsSCyILMgtCC1ILYgtyC4ILkguiC7ILwgvSC+IL8gwCDBIMIgwyDEIMUgxiDHIMggySDKIMsgzCDNIM4gzyDQINEg0iDTINQg1SDWINcg2CDZINog2yDcIN0g3iDfIOAg4SDiIOMg5CDlIOYg5yDoIOkg6iDrIOwg7SDuIO8g8CDxIPIg8yD0IPUg9iD3IPgg+SD6IPsg/CD9IP4g/w==' + b_byte_encoded_urlsafe = 'ACABIAIgAyAEIAUgBiAHIAggCSAKIAsgDCANIA4gDyAQIBEgEiATIBQgFSAWIBcgGCAZIBogGyAcIB0gHiAfICAgISAiICMgJCAlICYgJyAoICkgKiArICwgLSAuIC8gMCAxIDIgMyA0IDUgNiA3IDggOSA6IDsgPCA9ID4gPyBAIEEgQiBDIEQgRSBGIEcgSCBJIEogSyBMIE0gTiBPIFAgUSBSIFMgVCBVIFYgVyBYIFkgWiBbIFwgXSBeIF8gYCBhIGIgYyBkIGUgZiBnIGggaSBqIGsgbCBtIG4gbyBwIHEgciBzIHQgdSB2IHcgeCB5IHogeyB8IH0gfiB_IIAggSCCIIMghCCFIIYghyCIIIkgiiCLIIwgjSCOII8gkCCRIJIgkyCUIJUgliCXIJggmSCaIJsgnCCdIJ4gnyCgIKEgoiCjIKQgpSCmIKcgqCCpIKogqyCsIK0griCvILAgsSCyILMgtCC1ILYgtyC4ILkguiC7ILwgvSC-IL8gwCDBIMIgwyDEIMUgxiDHIMggySDKIMsgzCDNIM4gzyDQINEg0iDTINQg1SDWINcg2CDZINog2yDcIN0g3iDfIOAg4SDiIOMg5CDlIOYg5yDoIOkg6iDrIOwg7SDuIO8g8CDxIPIg8yD0IPUg9iD3IPgg-SD6IPsg_CD9IP4g_w==' + + def test_base64_encode(self): + tools.ok_(base64.b64encode(self.b_byte_chars) == self.b_byte_encoded) + tools.ok_(base64.b64encode(self.b_byte_chars, altchars='-_') == self.b_byte_encoded_urlsafe) + tools.ok_(base64.standard_b64encode(self.b_byte_chars) == self.b_byte_encoded) + tools.ok_(base64.urlsafe_b64encode(self.b_byte_chars) == self.b_byte_encoded_urlsafe) + + tools.ok_(base64.b64encode(self.b_byte_chars) == self.b_byte_encoded) + tools.ok_(base64.b64encode(self.b_byte_chars, altchars='-_') == self.b_byte_encoded_urlsafe) + tools.ok_(base64.standard_b64encode(self.b_byte_chars) == self.b_byte_encoded) + tools.ok_(base64.urlsafe_b64encode(self.b_byte_chars) == self.b_byte_encoded_urlsafe) + + def test_base64_decode(self): + tools.ok_(base64.b64decode(self.b_byte_encoded) == self.b_byte_chars) + tools.ok_(base64.b64decode(self.b_byte_encoded_urlsafe, altchars='-_') == self.b_byte_chars) + tools.ok_(base64.standard_b64decode(self.b_byte_encoded) == self.b_byte_chars) + tools.ok_(base64.urlsafe_b64decode(self.b_byte_encoded_urlsafe) == self.b_byte_chars) + + tools.ok_(base64.b64decode(self.b_byte_encoded) == self.b_byte_chars) + tools.ok_(base64.b64decode(self.b_byte_encoded_urlsafe, altchars='-_') == self.b_byte_chars) + tools.ok_(base64.standard_b64decode(self.b_byte_encoded) == self.b_byte_chars) + tools.ok_(base64.urlsafe_b64decode(self.b_byte_encoded_urlsafe) == self.b_byte_chars) + + def test_base64_stdlib_compat(self): + if not hasattr(py_b64, 'b64encode'): + raise SkipTest('Python-2.3 doesn\'t have b64encode to compare against') + tools.ok_(base64.b64encode(self.b_byte_chars) == 
py_b64.b64encode(self.b_byte_chars)) + tools.ok_(base64.b64decode(self.b_byte_chars) == py_b64.b64decode(self.b_byte_chars)) diff --git a/tests/test_subprocess.py b/tests/test_subprocess.py new file mode 100644 index 0000000..4a55f8c --- /dev/null +++ b/tests/test_subprocess.py @@ -0,0 +1,1457 @@ +import unittest +from nose.plugins.skip import SkipTest +from test import test_support +from kitchen.pycompat27.subprocess import _subprocess as subprocess +import sys +import StringIO +import signal +import os +import errno +import tempfile +import time +import re +# Not available on python2.6 or less +#import sysconfig + +mswindows = (sys.platform == "win32") + +# +# Depends on the following external programs: Python +# + +if mswindows: + SETBINARY = ('import msvcrt; msvcrt.setmode(sys.stdout.fileno(), ' + 'os.O_BINARY);') +else: + SETBINARY = '' + +def reap_children(): + """Use this function at the end of test_main() whenever sub-processes + are started. This will help ensure that no extra children (zombies) + stick around to hog resources and create problems when looking + for refleaks. + """ + + # Reap all our dead child processes so we don't leave zombies around. + # These hog resources and might be causing some of the buildbots to die. + if hasattr(os, 'waitpid'): + any_process = -1 + while True: + try: + # This will raise an exception on Windows. That's ok. + pid, status = os.waitpid(any_process, os.WNOHANG) + if pid == 0: + break + except: + break + +if not hasattr(test_support, 'reap_children'): + # No reap_children in python-2.3 + test_support.reap_children = reap_children + +# In a debug build, stuff like "[6580 refs]" is printed to stderr at +# shutdown time. That frustrates tests trying to check stderr produced +# from a spawned Python process. +def remove_stderr_debug_decorations(stderr): + return re.sub(r"\[\d+ refs\]\r?\n?$", "", stderr) + +try: + mkstemp = tempfile.mkstemp +except AttributeError: + # tempfile.mkstemp is not available + def mkstemp(): + """Replacement for mkstemp, calling mktemp.""" + fname = tempfile.mktemp() + return os.open(fname, os.O_RDWR|os.O_CREAT), fname + + +class BaseTestCase(unittest.TestCase): + def __init__(self, *args, **kwargs): + unittest.TestCase.__init__(self, *args, **kwargs) + if not hasattr(self, '_cleanups'): + self._cleanups = [] + if not hasattr(self, 'addCleanup'): + self.addCleanup = self._addCleanup + + def _addCleanup(self, function, *args, **kwargs): + self._cleanups.append((function, args, kwargs)) + + def setUp(self): + # Try to minimize the number of children we have so this test + # doesn't crash on some buildbots (Alphas in particular). + test_support.reap_children() + + def tearDown(self): + for inst in subprocess._active: + inst.wait() + subprocess._cleanup() + # assertFalse is not available in python-2.3 + self.failIf(subprocess._active, "subprocess._active not empty") + + if not hasattr(self, 'doCleanups'): + ok = True + while self._cleanups: + function, args, kwargs = self._cleanups.pop(-1) + try: + function(*args, **kwargs) + except Exception: + ok = False + + def assertStderrEqual(self, stderr, expected, msg=None): + # In a debug build, stuff like "[6580 refs]" is printed to stderr at + # shutdown time. That frustrates tests trying to check stderr produced + # from a spawned Python process. 
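+        # (Editorial note: the substitution below strips a trailing
+        # "[6580 refs]"-style marker, mirroring the module-level
+        # remove_stderr_debug_decorations helper defined earlier.)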
+ actual = re.sub(r"\[\d+ refs\]\r?\n?$", "", stderr) + self.assertEqual(actual, expected, msg) + + +class ProcessTestCase(BaseTestCase): + + def test_call_seq(self): + # call() function with sequence argument + rc = subprocess.call([sys.executable, "-c", + "import sys; sys.exit(47)"]) + self.assertEqual(rc, 47) + + def test_check_call_zero(self): + # check_call() function with zero return code + rc = subprocess.check_call([sys.executable, "-c", + "import sys; sys.exit(0)"]) + self.assertEqual(rc, 0) + + def test_check_call_nonzero(self): + # check_call() function with non-zero return code + #with self.assertRaises(subprocess.CalledProcessError) as c: + try: + subprocess.check_call([sys.executable, "-c", + "import sys; sys.exit(47)"]) + #self.assertEqual(c.exception.returncode, 47) + except subprocess.CalledProcessError, e: + self.assertEqual(e.returncode, 47) + else: + self.fail("Expected CalledProcessError") + + def test_check_output(self): + # check_output() function with zero return code + output = subprocess.check_output( + [sys.executable, "-c", "print 'BDFL'"]) + #self.assertIn('BDFL', output) + self.assert_('BDFL' in output) + + def test_check_output_nonzero(self): + # check_call() function with non-zero return code + #with self.assertRaises(subprocess.CalledProcessError) as c: + try: + subprocess.check_output( + [sys.executable, "-c", "import sys; sys.exit(5)"]) + #self.assertEqual(c.exception.returncode, 5) + except subprocess.CalledProcessError, e: + self.assertEqual(e.returncode, 5) + else: + self.fail("Expected CalledProcessError") + + def test_check_output_stderr(self): + # check_output() function stderr redirected to stdout + output = subprocess.check_output( + [sys.executable, "-c", "import sys; sys.stderr.write('BDFL')"], + stderr=subprocess.STDOUT) + #self.assertIn('BDFL', output) + self.assert_('BDFL' in output) + + def test_check_output_stdout_arg(self): + # check_output() function stderr redirected to stdout + #with self.assertRaises(ValueError) as c: + try: + output = subprocess.check_output( + [sys.executable, "-c", "print 'will not be run'"], + stdout=sys.stdout) + #self.fail("Expected ValueError when stdout arg supplied.") + #self.assertIn('stdout', c.exception.args[0]) + except ValueError, e: + #self.assertIn('stdout', e.args[0]) + self.assert_('stdout' in e.args[0]) + else: + self.fail("Expected ValueError when stdout arg supplied.") + + def test_call_kwargs(self): + # call() function with keyword args + newenv = os.environ.copy() + newenv["FRUIT"] = "banana" + rc = subprocess.call([sys.executable, "-c", + 'import sys, os;' + 'sys.exit(os.getenv("FRUIT")=="banana")'], + env=newenv) + self.assertEqual(rc, 1) + + def test_invalid_args(self): + # Popen() called with invalid arguments should raise TypeError + # but Popen.__del__ should not complain (issue #12085) + #with test_support.captured_stderr() as s: + orig_stderr = getattr(sys, 'stderr') + setattr(sys, 'stderr', StringIO.StringIO()) + s = sys.stderr + try: + try: + subprocess.Popen(invalid_arg_name=1) + except TypeError: + pass + except: + self.fail("Expected TypeError") + else: + self.fail("Expected TypeError") + argcount = subprocess.Popen.__init__.func_code.co_argcount + too_many_args = [0] * (argcount + 1) + try: + subprocess.Popen(*too_many_args) + except TypeError: + pass + except: + self.fail("Expected TypeError") + else: + self.fail("Expected TypeError") + finally: + setattr(sys,'stderr', orig_stderr) + self.assertEqual(s.getvalue(), '') + + def test_stdin_none(self): + # .stdin is None when not 
redirected + p = subprocess.Popen([sys.executable, "-c", 'print "banana"'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + self.addCleanup(p.stdout.close) + self.addCleanup(p.stderr.close) + p.wait() + self.assertEqual(p.stdin, None) + + def test_stdout_none(self): + # .stdout is None when not redirected + p = subprocess.Popen([sys.executable, "-c", + 'print " this bit of output is from a ' + 'test of stdout in a different ' + 'process ..."'], + stdin=subprocess.PIPE, stderr=subprocess.PIPE) + self.addCleanup(p.stdin.close) + self.addCleanup(p.stderr.close) + p.wait() + self.assertEqual(p.stdout, None) + + def test_stderr_none(self): + # .stderr is None when not redirected + p = subprocess.Popen([sys.executable, "-c", 'print "banana"'], + stdin=subprocess.PIPE, stdout=subprocess.PIPE) + self.addCleanup(p.stdout.close) + self.addCleanup(p.stdin.close) + p.wait() + self.assertEqual(p.stderr, None) + + def test_executable_with_cwd(self): + python_dir = os.path.dirname(os.path.realpath(sys.executable)) + p = subprocess.Popen(["somethingyoudonthave", "-c", + "import sys; sys.exit(47)"], + executable=sys.executable, cwd=python_dir) + p.wait() + self.assertEqual(p.returncode, 47) + + # Not available on python2.3 and we know we're not building python itself + #@unittest.skipIf(sysconfig.is_python_build(), + # "need an installed Python. See #7774") + def test_executable_without_cwd(self): + # For a normal installation, it should work without 'cwd' + # argument. For test runs in the build directory, see #7774. + p = subprocess.Popen(["somethingyoudonthave", "-c", + "import sys; sys.exit(47)"], + executable=sys.executable) + p.wait() + self.assertEqual(p.returncode, 47) + + def test_stdin_pipe(self): + # stdin redirection + p = subprocess.Popen([sys.executable, "-c", + 'import sys; sys.exit(sys.stdin.read() == "pear")'], + stdin=subprocess.PIPE) + p.stdin.write("pear") + p.stdin.close() + p.wait() + self.assertEqual(p.returncode, 1) + + def test_stdin_filedes(self): + # stdin is set to open file descriptor + tf = tempfile.TemporaryFile() + d = tf.fileno() + os.write(d, "pear") + os.lseek(d, 0, 0) + p = subprocess.Popen([sys.executable, "-c", + 'import sys; sys.exit(sys.stdin.read() == "pear")'], + stdin=d) + p.wait() + self.assertEqual(p.returncode, 1) + + def test_stdin_fileobj(self): + # stdin is set to open file object + tf = tempfile.TemporaryFile() + tf.write("pear") + tf.seek(0) + p = subprocess.Popen([sys.executable, "-c", + 'import sys; sys.exit(sys.stdin.read() == "pear")'], + stdin=tf) + p.wait() + self.assertEqual(p.returncode, 1) + + def test_stdout_pipe(self): + # stdout redirection + p = subprocess.Popen([sys.executable, "-c", + 'import sys; sys.stdout.write("orange")'], + stdout=subprocess.PIPE) + self.addCleanup(p.stdout.close) + self.assertEqual(p.stdout.read(), "orange") + + def test_stdout_filedes(self): + # stdout is set to open file descriptor + tf = tempfile.TemporaryFile() + d = tf.fileno() + p = subprocess.Popen([sys.executable, "-c", + 'import sys; sys.stdout.write("orange")'], + stdout=d) + p.wait() + os.lseek(d, 0, 0) + self.assertEqual(os.read(d, 1024), "orange") + + def test_stdout_fileobj(self): + # stdout is set to open file object + tf = tempfile.TemporaryFile() + p = subprocess.Popen([sys.executable, "-c", + 'import sys; sys.stdout.write("orange")'], + stdout=tf) + p.wait() + tf.seek(0) + self.assertEqual(tf.read(), "orange") + + def test_stderr_pipe(self): + # stderr redirection + p = subprocess.Popen([sys.executable, "-c", + 'import sys; 
sys.stderr.write("strawberry")'], + stderr=subprocess.PIPE) + self.addCleanup(p.stderr.close) + #self.assertStderrEqual(p.stderr.read(), "strawberry") + self.assertEqual(remove_stderr_debug_decorations(p.stderr.read()), + "strawberry") + + def test_stderr_filedes(self): + # stderr is set to open file descriptor + tf = tempfile.TemporaryFile() + d = tf.fileno() + p = subprocess.Popen([sys.executable, "-c", + 'import sys; sys.stderr.write("strawberry")'], + stderr=d) + p.wait() + os.lseek(d, 0, 0) + #self.assertStderrEqual(os.read(d, 1024), "strawberry") + self.assertEqual(remove_stderr_debug_decorations(os.read(d, 1024)), + "strawberry") + + def test_stderr_fileobj(self): + # stderr is set to open file object + tf = tempfile.TemporaryFile() + p = subprocess.Popen([sys.executable, "-c", + 'import sys; sys.stderr.write("strawberry")'], + stderr=tf) + p.wait() + tf.seek(0) + #self.assertStderrEqual(tf.read(), "strawberry") + self.assertEqual(remove_stderr_debug_decorations(tf.read()), + "strawberry") + + def test_stdout_stderr_pipe(self): + # capture stdout and stderr to the same pipe + p = subprocess.Popen([sys.executable, "-c", + 'import sys;' + 'sys.stdout.write("apple");' + 'sys.stdout.flush();' + 'sys.stderr.write("orange")'], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + self.addCleanup(p.stdout.close) + #self.assertStderrEqual(p.stdout.read(), "appleorange") + output = p.stdout.read() + stripped = remove_stderr_debug_decorations(output) + self.assertEqual(stripped, "appleorange") + + def test_stdout_stderr_file(self): + # capture stdout and stderr to the same open file + tf = tempfile.TemporaryFile() + p = subprocess.Popen([sys.executable, "-c", + 'import sys;' + 'sys.stdout.write("apple");' + 'sys.stdout.flush();' + 'sys.stderr.write("orange")'], + stdout=tf, + stderr=tf) + p.wait() + tf.seek(0) + #self.assertStderrEqual(tf.read(), "appleorange") + output = tf.read() + stripped = remove_stderr_debug_decorations(output) + self.assertEqual(stripped, "appleorange") + + def test_stdout_filedes_of_stdout(self): + # stdout is set to 1 (#1531862). + cmd = r"import sys, os; sys.exit(os.write(sys.stdout.fileno(), '.\n'))" + rc = subprocess.call([sys.executable, "-c", cmd], stdout=1) + self.assertEqual(rc, 2) + + def test_cwd(self): + tmpdir = tempfile.gettempdir() + # We cannot use os.path.realpath to canonicalize the path, + # since it doesn't expand Tru64 {memb} strings. See bug 1063571. 
+ cwd = os.getcwd() + os.chdir(tmpdir) + tmpdir = os.getcwd() + os.chdir(cwd) + p = subprocess.Popen([sys.executable, "-c", + 'import sys,os;' + 'sys.stdout.write(os.getcwd())'], + stdout=subprocess.PIPE, + cwd=tmpdir) + self.addCleanup(p.stdout.close) + normcase = os.path.normcase + self.assertEqual(normcase(p.stdout.read()), normcase(tmpdir)) + + def test_env(self): + newenv = os.environ.copy() + newenv["FRUIT"] = "orange" + p = subprocess.Popen([sys.executable, "-c", + 'import sys,os;' + 'sys.stdout.write(os.getenv("FRUIT"))'], + stdout=subprocess.PIPE, + env=newenv) + self.assertEqual(p.stdout.read(), "orange") + + def test_communicate_stdin(self): + p = subprocess.Popen([sys.executable, "-c", + 'import sys;' + 'sys.exit(sys.stdin.read() == "pear")'], + stdin=subprocess.PIPE) + p.communicate("pear") + self.assertEqual(p.returncode, 1) + + def test_communicate_stdout(self): + p = subprocess.Popen([sys.executable, "-c", + 'import sys; sys.stdout.write("pineapple")'], + stdout=subprocess.PIPE) + (stdout, stderr) = p.communicate() + self.assertEqual(stdout, "pineapple") + self.assertEqual(stderr, None) + + def test_communicate_stderr(self): + p = subprocess.Popen([sys.executable, "-c", + 'import sys; sys.stderr.write("pineapple")'], + stderr=subprocess.PIPE) + (stdout, stderr) = p.communicate() + self.assertEqual(stdout, None) + #self.assertStderrEqual(stderr, "pineapple") + self.assertEqual(remove_stderr_debug_decorations(stderr), "pineapple") + + def test_communicate(self): + p = subprocess.Popen([sys.executable, "-c", + 'import sys,os;' + 'sys.stderr.write("pineapple");' + 'sys.stdout.write(sys.stdin.read())'], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + self.addCleanup(p.stdout.close) + self.addCleanup(p.stderr.close) + self.addCleanup(p.stdin.close) + (stdout, stderr) = p.communicate("banana") + self.assertEqual(stdout, "banana") + #self.assertStderrEqual(stderr, "pineapple") + self.assertEqual(remove_stderr_debug_decorations(stderr), + "pineapple") + + # Not available with python-2.6's unittest: Reimplement by + # raising SkipTest from nose + # This test is Linux specific for simplicity to at least have + # some coverage. It is not a platform specific bug. + #@unittest.skipUnless(os.path.isdir('/proc/%d/fd' % os.getpid()), + # "Linux specific") + # Test for the fd leak reported in http://bugs.python.org/issue2791. + def test_communicate_pipe_fd_leak(self): + if not os.path.isdir('/proc/%d/fd' % os.getpid()): + raise SkipTest('Linux specific') + fd_directory = '/proc/%d/fd' % os.getpid() + num_fds_before_popen = len(os.listdir(fd_directory)) + p = subprocess.Popen([sys.executable, "-c", "print()"], + stdout=subprocess.PIPE) + p.communicate() + num_fds_after_communicate = len(os.listdir(fd_directory)) + del p + num_fds_after_destruction = len(os.listdir(fd_directory)) + self.assertEqual(num_fds_before_popen, num_fds_after_destruction) + self.assertEqual(num_fds_before_popen, num_fds_after_communicate) + + def test_communicate_returns(self): + # communicate() should return None if no redirection is active + p = subprocess.Popen([sys.executable, "-c", + "import sys; sys.exit(47)"]) + (stdout, stderr) = p.communicate() + self.assertEqual(stdout, None) + self.assertEqual(stderr, None) + + def test_communicate_pipe_buf(self): + # communicate() with writes larger than pipe_buf + # This test will probably deadlock rather than fail, if + # communicate() does not work properly. 
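+        # PC_PIPE_BUF is the largest write the OS guarantees to handle
+        # atomically on this pipe; writing several times that much data,
+        # as below, blocks the writer unless communicate() keeps draining
+        # the child's output while feeding its stdin.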
+ x, y = os.pipe() + if mswindows: + pipe_buf = 512 + else: + pipe_buf = os.fpathconf(x, "PC_PIPE_BUF") + os.close(x) + os.close(y) + p = subprocess.Popen([sys.executable, "-c", + 'import sys,os;' + 'sys.stdout.write(sys.stdin.read(47));' + 'sys.stderr.write("xyz"*%d);' + 'sys.stdout.write(sys.stdin.read())' % pipe_buf], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + self.addCleanup(p.stdout.close) + self.addCleanup(p.stderr.close) + self.addCleanup(p.stdin.close) + string_to_write = "abc"*pipe_buf + (stdout, stderr) = p.communicate(string_to_write) + self.assertEqual(stdout, string_to_write) + + def test_writes_before_communicate(self): + # stdin.write before communicate() + p = subprocess.Popen([sys.executable, "-c", + 'import sys,os;' + 'sys.stdout.write(sys.stdin.read())'], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + self.addCleanup(p.stdout.close) + self.addCleanup(p.stderr.close) + self.addCleanup(p.stdin.close) + p.stdin.write("banana") + (stdout, stderr) = p.communicate("split") + self.assertEqual(stdout, "bananasplit") + #self.assertStderrEqual(stderr, "") + self.assertEqual(remove_stderr_debug_decorations(stderr), "") + + def test_universal_newlines(self): + p = subprocess.Popen([sys.executable, "-c", + 'import sys,os;' + SETBINARY + + 'sys.stdout.write("line1\\n");' + 'sys.stdout.flush();' + 'sys.stdout.write("line2\\r");' + 'sys.stdout.flush();' + 'sys.stdout.write("line3\\r\\n");' + 'sys.stdout.flush();' + 'sys.stdout.write("line4\\r");' + 'sys.stdout.flush();' + 'sys.stdout.write("\\nline5");' + 'sys.stdout.flush();' + 'sys.stdout.write("\\nline6");'], + stdout=subprocess.PIPE, + universal_newlines=1) + self.addCleanup(p.stdout.close) + stdout = p.stdout.read() + if hasattr(file, 'newlines'): + # Interpreter with universal newline support + self.assertEqual(stdout, + "line1\nline2\nline3\nline4\nline5\nline6") + else: + # Interpreter without universal newline support + self.assertEqual(stdout, + "line1\nline2\rline3\r\nline4\r\nline5\nline6") + + def test_universal_newlines_communicate(self): + # universal newlines through communicate() + p = subprocess.Popen([sys.executable, "-c", + 'import sys,os;' + SETBINARY + + 'sys.stdout.write("line1\\n");' + 'sys.stdout.flush();' + 'sys.stdout.write("line2\\r");' + 'sys.stdout.flush();' + 'sys.stdout.write("line3\\r\\n");' + 'sys.stdout.flush();' + 'sys.stdout.write("line4\\r");' + 'sys.stdout.flush();' + 'sys.stdout.write("\\nline5");' + 'sys.stdout.flush();' + 'sys.stdout.write("\\nline6");'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + universal_newlines=1) + self.addCleanup(p.stdout.close) + self.addCleanup(p.stderr.close) + (stdout, stderr) = p.communicate() + if hasattr(file, 'newlines'): + # Interpreter with universal newline support + self.assertEqual(stdout, + "line1\nline2\nline3\nline4\nline5\nline6") + else: + # Interpreter without universal newline support + self.assertEqual(stdout, + "line1\nline2\rline3\r\nline4\r\nline5\nline6") + + def test_no_leaking(self): + # Make sure we leak no resources + if not mswindows: + max_handles = 1026 # too much for most UNIX systems + else: + max_handles = 2050 # too much for (at least some) Windows setups + handles = [] + try: + for i in range(max_handles): + try: + handles.append(os.open(test_support.TESTFN, + os.O_WRONLY | os.O_CREAT)) + except OSError, e: + if e.errno != errno.EMFILE: + raise + break + else: + # python-2.3 unittest doesn't have skipTest. 
Reimplement with nose + #self.skipTest("failed to reach the file descriptor limit " + # "(tried %d)" % max_handles) + raise SkipTest("failed to reach the file descriptor limit " + "(tried %d)" % max_handles) + + # Close a couple of them (should be enough for a subprocess) + for i in range(10): + os.close(handles.pop()) + # Loop creating some subprocesses. If one of them leaks some fds, + # the next loop iteration will fail by reaching the max fd limit. + for i in range(15): + p = subprocess.Popen([sys.executable, "-c", + "import sys;" + "sys.stdout.write(sys.stdin.read())"], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + data = p.communicate("lime")[0] + self.assertEqual(data, "lime") + finally: + for h in handles: + os.close(h) + + def test_list2cmdline(self): + self.assertEqual(subprocess.list2cmdline(['a b c', 'd', 'e']), + '"a b c" d e') + self.assertEqual(subprocess.list2cmdline(['ab"c', '\\', 'd']), + 'ab\\"c \\ d') + self.assertEqual(subprocess.list2cmdline(['ab"c', ' \\', 'd']), + 'ab\\"c " \\\\" d') + self.assertEqual(subprocess.list2cmdline(['a\\\\\\b', 'de fg', 'h']), + 'a\\\\\\b "de fg" h') + self.assertEqual(subprocess.list2cmdline(['a\\"b', 'c', 'd']), + 'a\\\\\\"b c d') + self.assertEqual(subprocess.list2cmdline(['a\\\\b c', 'd', 'e']), + '"a\\\\b c" d e') + self.assertEqual(subprocess.list2cmdline(['a\\\\b\\ c', 'd', 'e']), + '"a\\\\b\\ c" d e') + self.assertEqual(subprocess.list2cmdline(['ab', '']), + 'ab ""') + + + def test_poll(self): + p = subprocess.Popen([sys.executable, + "-c", "import time; time.sleep(1)"]) + count = 0 + while p.poll() is None: + time.sleep(0.1) + count += 1 + # We expect that the poll loop probably went around about 10 times, + # but, based on system scheduling we can't control, it's possible + # poll() never returned None. It "should be" very rare that it + # didn't go around at least twice. + #self.assertGreaterEqual(count, 2) + self.assert_(count >= 2) + # Subsequent invocations should just return the returncode + self.assertEqual(p.poll(), 0) + + + def test_wait(self): + p = subprocess.Popen([sys.executable, + "-c", "import time; time.sleep(2)"]) + self.assertEqual(p.wait(), 0) + # Subsequent invocations should just return the returncode + self.assertEqual(p.wait(), 0) + + + def test_invalid_bufsize(self): + # an invalid type of the bufsize argument should raise + # TypeError. + #with self.assertRaises(TypeError): + try: + subprocess.Popen([sys.executable, "-c", "pass"], "orange") + except TypeError: + pass + else: + self.fail("Expected TypeError") + + def test_leaking_fds_on_error(self): + # see bug #5179: Popen leaks file descriptors to PIPEs if + # the child fails to execute; this will eventually exhaust + # the maximum number of open fds. 1024 seems a very common + # value for that limit, but Windows has 2048, so we loop + # 1024 times (each call leaked two fds). + for i in range(1024): + # Windows raises IOError. Others raise OSError. 
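+            # EnvironmentError is the common base class of IOError and
+            # OSError, so a single except clause covers both platforms.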
+            #with self.assertRaises(EnvironmentError) as c:
+            try:
+                subprocess.Popen(['nonexisting_i_hope'],
+                                 stdout=subprocess.PIPE,
+                                 stderr=subprocess.PIPE)
+                #if c.exception.errno != 2:  # ignore "no such file"
+                #    raise c.exception
+                # Windows raises IOError
+            except EnvironmentError, err:
+                if err.errno not in (errno.ENOENT, errno.EACCES):  # ignore "no such file"
+                    raise err
+            except:
+                self.fail("Expected EnvironmentError")
+            else:
+                self.fail("Expected EnvironmentError")
+
+    def test_handles_closed_on_exception(self):
+        # If CreateProcess exits with an error, ensure the
+        # duplicate output handles are released
+        ifhandle, ifname = mkstemp()
+        ofhandle, ofname = mkstemp()
+        efhandle, efname = mkstemp()
+        try:
+            subprocess.Popen(["*"], stdin=ifhandle, stdout=ofhandle,
+                             stderr=efhandle)
+        except OSError:
+            os.close(ifhandle)
+            os.remove(ifname)
+            os.close(ofhandle)
+            os.remove(ofname)
+            os.close(efhandle)
+            os.remove(efname)
+        self.assert_(not os.path.exists(ifname))
+        self.assert_(not os.path.exists(ofname))
+        self.assert_(not os.path.exists(efname))
+
+    def test_communicate_epipe(self):
+        # Issue 10963: communicate() should hide EPIPE
+        p = subprocess.Popen([sys.executable, "-c", 'pass'],
+                             stdin=subprocess.PIPE,
+                             stdout=subprocess.PIPE,
+                             stderr=subprocess.PIPE)
+        self.addCleanup(p.stdout.close)
+        self.addCleanup(p.stderr.close)
+        self.addCleanup(p.stdin.close)
+        p.communicate("x" * 2**20)
+
+    def test_communicate_epipe_only_stdin(self):
+        # Issue 10963: communicate() should hide EPIPE
+        p = subprocess.Popen([sys.executable, "-c", 'pass'],
+                             stdin=subprocess.PIPE)
+        self.addCleanup(p.stdin.close)
+        time.sleep(2)
+        p.communicate("x" * 2**20)
+
+    # python-2.3 unittest doesn't have skipUnless.  Reimplement as SkipTest
+    # from nose
+    #@unittest.skipUnless(hasattr(signal, 'SIGALRM'),
+    #                     "Requires signal.SIGALRM")
+    def test_communicate_eintr(self):
+        if not hasattr(signal, 'SIGALRM'):
+            raise SkipTest('Requires signal.SIGALRM')
+        # Issue #12493: communicate() should handle EINTR
+        def handler(signum, frame):
+            pass
+        old_handler = signal.signal(signal.SIGALRM, handler)
+        self.addCleanup(signal.signal, signal.SIGALRM, old_handler)
+
+        # the process is running for 2 seconds
+        args = [sys.executable, "-c", 'import time; time.sleep(2)']
+        for stream in ('stdout', 'stderr'):
+            kw = {stream: subprocess.PIPE}
+            #with subprocess.Popen(args, **kw) as process:
+            try:
+                process = subprocess.Popen(args, **kw)
+                signal.alarm(1)
+                # communicate() will be interrupted by SIGALRM
+                process.communicate()
+            finally:
+                # Popen objects have no close() method; cancel the alarm,
+                # close the pipe we opened, and reap the child instead
+                signal.alarm(0)
+                getattr(process, stream).close()
+                process.wait()
+
+
+# context manager
+class _SuppressCoreFiles(object):
+    """Try to prevent core files from being created."""
+    old_limit = None
+
+    def __enter__(self):
+        """Try to save previous ulimit, then set it to (0, 0)."""
+        try:
+            import resource
+            self.old_limit = resource.getrlimit(resource.RLIMIT_CORE)
+            resource.setrlimit(resource.RLIMIT_CORE, (0, 0))
+        except ImportError:
+            # catch ImportError on its own: if the import failed, the
+            # name resource.error below would not resolve
+            pass
+        except (ValueError, resource.error):
+            pass
+
+        if sys.platform == 'darwin':
+            # Check if the 'Crash Reporter' on OSX was configured
+            # in 'Developer' mode and warn that it will get triggered
+            # when it is.
+            #
+            # This assumes that this context manager is used in tests
+            # that might trigger the next manager.
+            value = subprocess.Popen(['/usr/bin/defaults', 'read',
+                                      'com.apple.CrashReporter', 'DialogType'],
+                                     stdout=subprocess.PIPE).communicate()[0]
+            if value.strip() == 'developer':
+                print "this test triggers the Crash Reporter; that is intentional"
+                sys.stdout.flush()
+
+    def __exit__(self, *args):
+        """Return core file behavior to default."""
+        if self.old_limit is None:
+            return
+        try:
+            import resource
+            resource.setrlimit(resource.RLIMIT_CORE, self.old_limit)
+        except ImportError:
+            pass
+        except (ValueError, resource.error):
+            pass
+
+
+# Not available with python-2.3's unittest.  Reimplement with SkipTest from
+# nose
+#@unittest.skipIf(mswindows, "POSIX specific tests")
+class POSIXProcessTestCase(BaseTestCase):
+    def setUp(self):
+        if mswindows:
+            raise SkipTest('POSIX specific tests')
+
+    def test_exceptions(self):
+        # caught & re-raised exceptions
+        #with self.assertRaises(OSError) as c:
+        try:
+            p = subprocess.Popen([sys.executable, "-c", ""],
+                                 cwd="/this/path/does/not/exist")
+            # The attribute child_traceback should contain "os.chdir" somewhere.
+            #self.assertIn("os.chdir", c.exception.child_traceback)
+        except OSError, e:
+            self.assertNotEqual(e.child_traceback.find("os.chdir"), -1)
+        except:
+            self.fail("Expected OSError")
+        else:
+            self.fail("Expected OSError")
+
+    def _suppress_core_files(self):
+        """Try to prevent core files from being created.
+        Returns previous ulimit if successful, else None.
+        """
+        try:
+            import resource
+            old_limit = resource.getrlimit(resource.RLIMIT_CORE)
+            resource.setrlimit(resource.RLIMIT_CORE, (0, 0))
+            return old_limit
+        except ImportError:
+            return None
+        except (ValueError, resource.error):
+            return None
+
+    def _unsuppress_core_files(self, old_limit):
+        """Return core file behavior to default."""
+        if old_limit is None:
+            return
+        try:
+            import resource
+            resource.setrlimit(resource.RLIMIT_CORE, old_limit)
+        except ImportError:
+            return
+        except (ValueError, resource.error):
+            return
+
+    def test_run_abort(self):
+        # returncode handles signal termination
+        #with _SuppressCoreFiles():
+        old_limit = self._suppress_core_files()
+        try:
+            p = subprocess.Popen([sys.executable, "-c",
+                                  "import os; os.abort()"])
+        finally:
+            self._unsuppress_core_files(old_limit)
+
+        #p.wait()
+        p.wait()
+        self.assertEqual(-p.returncode, signal.SIGABRT)
+
+    def test_preexec(self):
+        # preexec function
+        p = subprocess.Popen([sys.executable, "-c",
+                              "import sys, os;"
+                              "sys.stdout.write(os.getenv('FRUIT'))"],
+                             stdout=subprocess.PIPE,
+                             preexec_fn=lambda: os.putenv("FRUIT", "apple"))
+        self.addCleanup(p.stdout.close)
+        self.assertEqual(p.stdout.read(), "apple")
+
+    def test_args_string(self):
+        # args is a string
+        f, fname = mkstemp()
+        os.write(f, "#!/bin/sh\n")
+        os.write(f, "exec '%s' -c 'import sys; sys.exit(47)'\n" %
+                 sys.executable)
+        os.close(f)
+        # 0o is not available in python2.5
+        #os.chmod(fname, 0o700)
+        os.chmod(fname, 0700)
+        p = subprocess.Popen(fname)
+        p.wait()
+        os.remove(fname)
+        self.assertEqual(p.returncode, 47)
+
+    def test_invalid_args(self):
+        # invalid arguments should raise ValueError
+        self.assertRaises(ValueError, subprocess.call,
+                          [sys.executable, "-c",
+                           "import sys; sys.exit(47)"],
+                          startupinfo=47)
+        self.assertRaises(ValueError, subprocess.call,
+                          [sys.executable, "-c",
+                           "import sys; 
sys.exit(47)"], + creationflags=47) + + def test_shell_sequence(self): + # Run command through the shell (sequence) + newenv = os.environ.copy() + newenv["FRUIT"] = "apple" + p = subprocess.Popen(["echo $FRUIT"], shell=1, + stdout=subprocess.PIPE, + env=newenv) + self.addCleanup(p.stdout.close) + self.assertEqual(p.stdout.read().strip(), "apple") + + def test_shell_string(self): + # Run command through the shell (string) + newenv = os.environ.copy() + newenv["FRUIT"] = "apple" + p = subprocess.Popen("echo $FRUIT", shell=1, + stdout=subprocess.PIPE, + env=newenv) + self.addCleanup(p.stdout.close) + self.assertEqual(p.stdout.read().strip(), "apple") + + def test_call_string(self): + # call() function with string argument on UNIX + f, fname = mkstemp() + os.write(f, "#!/bin/sh\n") + os.write(f, "exec '%s' -c 'import sys; sys.exit(47)'\n" % + sys.executable) + os.close(f) + os.chmod(fname, 0700) + rc = subprocess.call(fname) + os.remove(fname) + self.assertEqual(rc, 47) + + def test_specific_shell(self): + # Issue #9265: Incorrect name passed as arg[0]. + shells = [] + for prefix in ['/bin', '/usr/bin/', '/usr/local/bin']: + for name in ['bash', 'ksh']: + sh = os.path.join(prefix, name) + if os.path.isfile(sh): + shells.append(sh) + if not shells: # Will probably work for any shell but csh. + + # skipTest unavailable on python<2.7 reimplement with nose + #self.skipTest("bash or ksh required for this test") + raise SkipTest("bash or ksh required for this test") + sh = '/bin/sh' + if os.path.isfile(sh) and not os.path.islink(sh): + # Test will fail if /bin/sh is a symlink to csh. + shells.append(sh) + for sh in shells: + p = subprocess.Popen("echo $0", executable=sh, shell=True, + stdout=subprocess.PIPE) + self.addCleanup(p.stdout.close) + self.assertEqual(p.stdout.read().strip(), sh) + + def _kill_process(self, method, *args): + # Do not inherit file handles from the parent. + # It should fix failures on some platforms. + p = subprocess.Popen([sys.executable, "-c", """if 1: + import sys, time + sys.stdout.write('x\\n') + sys.stdout.flush() + time.sleep(30) + """], + close_fds=True, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + # Wait for the interpreter to be completely initialized before + # sending any signal. 
+ p.stdout.read(1) + getattr(p, method)(*args) + return p + + # These are still hanging with x86_64 Fedora 12 (python-2.6) subprocess + # backport from current trunk run under nosetests + + def test_send_signal(self): + #if hang DISABLED #2777 + p = self._kill_process('send_signal', signal.SIGINT) + _, stderr = p.communicate() + self.assert_('KeyboardInterrupt' in stderr) + self.assertNotEqual(p.wait(), 0) + + def test_kill(self): + #if hang DISABLED #2777 + p = self._kill_process('kill') + _, stderr = p.communicate() + self.assertStderrEqual(stderr, '') + self.assertEqual(p.wait(), -signal.SIGKILL) + + def test_terminate(self): + #if hang DISABLED #2777 + p = self._kill_process('terminate') + _, stderr = p.communicate() + self.assertStderrEqual(stderr, '') + self.assertEqual(p.wait(), -signal.SIGTERM) + + def check_close_std_fds(self, fds): + # Issue #9905: test that subprocess pipes still work properly with + # some standard fds closed + stdin = 0 + newfds = [] + for a in fds: + b = os.dup(a) + newfds.append(b) + if a == 0: + stdin = b + try: + for fd in fds: + os.close(fd) + out, err = subprocess.Popen([sys.executable, "-c", + 'import sys;' + 'sys.stdout.write("apple");' + 'sys.stdout.flush();' + 'sys.stderr.write("orange")'], + stdin=stdin, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE).communicate() + err = re.sub(r"\[\d+ refs\]\r?\n?$", "", err).strip() + self.assertEqual((out, err), ('apple', 'orange')) + finally: + for b, a in zip(newfds, fds): + os.dup2(b, a) + for b in newfds: + os.close(b) + + def test_close_fd_0(self): + self.check_close_std_fds([0]) + + def test_close_fd_1(self): + self.check_close_std_fds([1]) + + def test_close_fd_2(self): + self.check_close_std_fds([2]) + + def test_close_fds_0_1(self): + self.check_close_std_fds([0, 1]) + + def test_close_fds_0_2(self): + self.check_close_std_fds([0, 2]) + + def test_close_fds_1_2(self): + self.check_close_std_fds([1, 2]) + + def test_close_fds_0_1_2(self): + # Issue #10806: test that subprocess pipes still work properly with + # all standard fds closed. 
+ self.check_close_std_fds([0, 1, 2]) + + def check_swap_fds(self, stdin_no, stdout_no, stderr_no): + # open up some temporary files + temps = [mkstemp() for i in range(3)] + temp_fds = [fd for fd, fname in temps] + try: + # unlink the files -- we won't need to reopen them + for fd, fname in temps: + os.unlink(fname) + + # save a copy of the standard file descriptors + saved_fds = [os.dup(fd) for fd in range(3)] + try: + # duplicate the temp files over the standard fd's 0, 1, 2 + for fd, temp_fd in enumerate(temp_fds): + os.dup2(temp_fd, fd) + + # write some data to what will become stdin, and rewind + os.write(stdin_no, "STDIN") + os.lseek(stdin_no, 0, 0) + + # now use those files in the given order, so that subprocess + # has to rearrange them in the child + p = subprocess.Popen([sys.executable, "-c", + 'import sys; got = sys.stdin.read();' + 'sys.stdout.write("got %s"%got); sys.stderr.write("err")'], + stdin=stdin_no, + stdout=stdout_no, + stderr=stderr_no) + p.wait() + + for fd in temp_fds: + os.lseek(fd, 0, 0) + + out = os.read(stdout_no, 1024) + err = re.sub(r"\[\d+ refs\]\r?\n?$", "", os.read(stderr_no, 1024)).strip() + finally: + for std, saved in enumerate(saved_fds): + os.dup2(saved, std) + os.close(saved) + + self.assertEqual(out, "got STDIN") + self.assertEqual(err, "err") + + finally: + for fd in temp_fds: + os.close(fd) + + # When duping fds, if there arises a situation where one of the fds is + # either 0, 1 or 2, it is possible that it is overwritten (#12607). + # This tests all combinations of this. + def test_swap_fds(self): + self.check_swap_fds(0, 1, 2) + self.check_swap_fds(0, 2, 1) + self.check_swap_fds(1, 0, 2) + self.check_swap_fds(1, 2, 0) + self.check_swap_fds(2, 0, 1) + self.check_swap_fds(2, 1, 0) + + def test_wait_when_sigchild_ignored(self): + # NOTE: sigchild_ignore.py may not be an effective test on all OSes. + sigchild_ignore = test_support.findfile(os.path.join("subprocessdata", + "sigchild_ignore.py")) + p = subprocess.Popen([sys.executable, sigchild_ignore], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = p.communicate() + self.assertEqual(0, p.returncode, "sigchild_ignore.py exited" + " non-zero with this error:\n%s" % stderr) + + + def test_zombie_fast_process_del(self): + # Issue #12650: on Unix, if Popen.__del__() was called before the + # process exited, it wouldn't be added to subprocess._active, and would + # remain a zombie. + # spawn a Popen, and delete its reference before it exits + p = subprocess.Popen([sys.executable, "-c", + 'import sys, time;' + 'time.sleep(0.2)'], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + self.addCleanup(p.stdout.close) + self.addCleanup(p.stderr.close) + ident = id(p) + pid = p.pid + del p + # check that p is in the active processes list + # assertIn not in python< 2.7 + #self.assertIn(ident, [id(o) for o in subprocess._active]) + self.assert_(ident in [id(o) for o in subprocess._active]) + + def test_leak_fast_process_del_killed(self): + # Issue #12650: on Unix, if Popen.__del__() was called before the + # process exited, and the process got killed by a signal, it would never + # be removed from subprocess._active, which triggered a FD and memory + # leak. 
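+        # Background: subprocess parks abandoned-but-running children in
+        # the module-level _active list and tries to reap them whenever a
+        # new Popen is constructed.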
+ # spawn a Popen, delete its reference and kill it + p = subprocess.Popen([sys.executable, "-c", + 'import time;' + 'time.sleep(3)'], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + self.addCleanup(p.stdout.close) + self.addCleanup(p.stderr.close) + ident = id(p) + pid = p.pid + del p + os.kill(pid, signal.SIGKILL) + # check that p is in the active processes list + # assertIn not in python < 2.7 + self.assert_(ident in [id(o) for o in subprocess._active]) + #self.assertIn(ident, [id(o) for o in subprocess._active]) + + # let some time for the process to exit, and create a new Popen: this + # should trigger the wait() of p + time.sleep(0.2) + #with self.assertRaises(EnvironmentError) as c: + try: + try: + proc = subprocess.Popen(['nonexisting_i_hope'], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + finally: + pass + except EnvironmentError: + pass + except: + self.fail('Expected EnvironmentError') + else: + self.fail('Expected EnvironmentError') + # p should have been wait()ed on, and removed from the _active list + #self.assertRaises(OSError, os.waitpid, pid, 0) + try: + os.waitpid(pid, 0) + except OSError: + pass + except: + self.fail('Expected OSError') + else: + self.fail('Expected OSError') + #self.assertNotIn(ident, [id(o) for o in subprocess._active]) + self.assert_(ident not in [id(o) for o in subprocess._active]) + + def test_pipe_cloexec(self): + # Issue 12786: check that the communication pipes' FDs are set CLOEXEC, + # and are not inherited by another child process. + p1 = subprocess.Popen([sys.executable, "-c", + 'import os;' + 'os.read(0, 1)' + ], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + p2 = subprocess.Popen([sys.executable, "-c", """if True: + import os, errno, sys + for fd in %r: + try: + os.close(fd) + except OSError, e: + if e.errno != errno.EBADF: + raise + else: + sys.exit(1) + sys.exit(0) + """ % [f.fileno() for f in (p1.stdin, p1.stdout, + p1.stderr)] + ], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, close_fds=False) + p1.communicate('foo') + _, stderr = p2.communicate() + + self.assertEqual(p2.returncode, 0, "Unexpected error: " + repr(stderr)) + + +# Not available with python-2.3's unittest, reimplement with SkipTest from +# nose +##@unittest.skipUnless(mswindows, "Windows specific tests") +class Win32ProcessTestCase(BaseTestCase): + def setUp(self): + if not mswindows: + raise SkipTest('Windows specific tests') + + def test_startupinfo(self): + # startupinfo argument + # We uses hardcoded constants, because we do not want to + # depend on win32all. 
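+        # These values match the Win32 API constants:
+        # STARTF_USESHOWWINDOW is 0x00000001 and SW_MAXIMIZE is 3.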
+ STARTF_USESHOWWINDOW = 1 + SW_MAXIMIZE = 3 + startupinfo = subprocess.STARTUPINFO() + startupinfo.dwFlags = STARTF_USESHOWWINDOW + startupinfo.wShowWindow = SW_MAXIMIZE + # Since Python is a console process, it won't be affected + # by wShowWindow, but the argument should be silently + # ignored + subprocess.call([sys.executable, "-c", "import sys; sys.exit(0)"], + startupinfo=startupinfo) + + def test_creationflags(self): + # creationflags argument + CREATE_NEW_CONSOLE = 16 + sys.stderr.write(" a DOS box should flash briefly ...\n") + subprocess.call(sys.executable + + ' -c "import time; time.sleep(0.25)"', + creationflags=CREATE_NEW_CONSOLE) + + def test_invalid_args(self): + # invalid arguments should raise ValueError + self.assertRaises(ValueError, subprocess.call, + [sys.executable, "-c", + "import sys; sys.exit(47)"], + preexec_fn=lambda: 1) + self.assertRaises(ValueError, subprocess.call, + [sys.executable, "-c", + "import sys; sys.exit(47)"], + stdout=subprocess.PIPE, + close_fds=True) + + def test_close_fds(self): + # close file descriptors + rc = subprocess.call([sys.executable, "-c", + "import sys; sys.exit(47)"], + close_fds=True) + self.assertEqual(rc, 47) + + def test_shell_sequence(self): + # Run command through the shell (sequence) + newenv = os.environ.copy() + newenv["FRUIT"] = "physalis" + p = subprocess.Popen(["set"], shell=1, + stdout=subprocess.PIPE, + env=newenv) + self.addCleanup(p.stdout.close) + #self.assertIn("physalis", p.stdout.read()) + self.assert_("physalis" in p.stdout.read()) + + def test_shell_string(self): + # Run command through the shell (string) + newenv = os.environ.copy() + newenv["FRUIT"] = "physalis" + p = subprocess.Popen("set", shell=1, + stdout=subprocess.PIPE, + env=newenv) + self.addCleanup(p.stdout.close) + #self.assertIn("physalis", p.stdout.read()) + self.assert_("physalis" in p.stdout.read()) + + def test_call_string(self): + # call() function with string argument on Windows + rc = subprocess.call(sys.executable + + ' -c "import sys; sys.exit(47)"') + self.assertEqual(rc, 47) + + def _kill_process(self, method, *args): + # Some win32 buildbot raises EOFError if stdin is inherited + p = subprocess.Popen([sys.executable, "-c", """if 1: + import sys, time + sys.stdout.write('x\\n') + sys.stdout.flush() + time.sleep(30) + """], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + self.addCleanup(p.stdout.close) + self.addCleanup(p.stderr.close) + self.addCleanup(p.stdin.close) + # Wait for the interpreter to be completely initialized before + # sending any signal. + p.stdout.read(1) + getattr(p, method)(*args) + _, stderr = p.communicate() + self.assertStderrEqual(stderr, '') + returncode = p.wait() + self.assertNotEqual(returncode, 0) + + def test_send_signal(self): + #if hang DISABLED #2777 + self._kill_process('send_signal', signal.SIGTERM) + + def test_kill(self): + #if hang DISABLED #2777 + self._kill_process('kill') + + def test_terminate(self): + #if hang DISABLED #2777 + self._kill_process('terminate') + + +# Not available with python-2.3's unittest. 
reimplement with SkipTest from +# nose +#@unittest.skipUnless(getattr(subprocess, '_has_poll', False), +# "poll system call not supported") +class ProcessTestCaseNoPoll(ProcessTestCase): + def setUp(self): + if not getattr(subprocess, '_has_poll', False): + raise SkipTest('poll system call not supported') + subprocess._has_poll = False + ProcessTestCase.setUp(self) + + def tearDown(self): + subprocess._has_poll = True + ProcessTestCase.tearDown(self) + + +class HelperFunctionTests(unittest.TestCase): + # Not available with python-2.3's unittest. reimplement with SkipTest from + # nose + #@unittest.skipIf(mswindows, "errno and EINTR make no sense on windows") + def test_eintr_retry_call(self): + if mswindows: + raise SkipTest('errno and EINTR make no sense on windows') + record_calls = [] + def fake_os_func(*args): + record_calls.append(args) + if len(record_calls) == 2: + raise OSError(errno.EINTR, "fake interrupted system call") + # reversed() is not available in python-2.3 + args = list(args) + args.reverse() + return tuple(args) + + self.assertEqual((999, 256), + subprocess._eintr_retry_call(fake_os_func, 256, 999)) + self.assertEqual([(256, 999)], record_calls) + # This time there will be an EINTR so it will loop once. + self.assertEqual((666,), + subprocess._eintr_retry_call(fake_os_func, 666)) + self.assertEqual([(256, 999), (666,), (666,)], record_calls) + + +# SkipUnless is not available on python < 2.7, reimplement with nose +#@unittest.skipUnless(mswindows, "mswindows only") +class CommandsWithSpaces (BaseTestCase): + + def setUp(self): + if not mswindows: + raise SkipTest('mswindows only') + + super(CommandsWithSpaces, self).setUp() + f, fname = mkstemp(".py", "te st") + self.fname = fname.lower () + os.write(f, "import sys;" + "sys.stdout.write('%d %s' % (len(sys.argv), [a.lower () for a in sys.argv]))" + ) + os.close(f) + + def tearDown(self): + os.remove(self.fname) + super(CommandsWithSpaces, self).tearDown() + + def with_spaces(self, *args, **kwargs): + kwargs['stdout'] = subprocess.PIPE + p = subprocess.Popen(*args, **kwargs) + self.addCleanup(p.stdout.close) + self.assertEqual( + p.stdout.read ().decode("mbcs"), + "2 [%r, 'ab cd']" % self.fname + ) + + def test_shell_string_with_spaces(self): + # call() function with string argument with spaces on Windows + self.with_spaces('"%s" "%s" "%s"' % (sys.executable, self.fname, + "ab cd"), shell=1) + + def test_shell_sequence_with_spaces(self): + # call() function with sequence argument with spaces on Windows + self.with_spaces([sys.executable, self.fname, "ab cd"], shell=1) + + def test_noshell_string_with_spaces(self): + # call() function with string argument with spaces on Windows + self.with_spaces('"%s" "%s" "%s"' % (sys.executable, self.fname, + "ab cd")) + + def test_noshell_sequence_with_spaces(self): + # call() function with sequence argument with spaces on Windows + self.with_spaces([sys.executable, self.fname, "ab cd"]) + + +# We're using nosetests so we don't need this. 
This just leads to tests being
+# run twice
+#def test_main():
+#    unit_tests = (ProcessTestCase,
+#                  POSIXProcessTestCase,
+#                  Win32ProcessTestCase,
+#                  ProcessTestCaseNoPoll,
+#                  HelperFunctionTests
+#                  )
+#
+#    test_support.run_unittest(*unit_tests)
+#    test_support.reap_children()
+#
+#if __name__ == "__main__":
+#    test_main()
diff --git a/tests/test_text_display.py b/tests/test_text_display.py
new file mode 100644
index 0000000..d686977
--- /dev/null
+++ b/tests/test_text_display.py
@@ -0,0 +1,161 @@
+# -*- coding: utf-8 -*-
+#
+import unittest
+from nose import tools
+
+from kitchen.text.exceptions import ControlCharError
+
+from kitchen.text import display
+
+import base_classes
+
+class TestDisplay(base_classes.UnicodeTestData, unittest.TestCase):
+
+    def test_internal_interval_bisearch(self):
+        '''Test that we can find things in an interval table'''
+        table = ((0, 3), (5, 7), (9, 10))
+        tools.ok_(display._interval_bisearch(0, table) == True)
+        tools.ok_(display._interval_bisearch(1, table) == True)
+        tools.ok_(display._interval_bisearch(2, table) == True)
+        tools.ok_(display._interval_bisearch(3, table) == True)
+        tools.ok_(display._interval_bisearch(5, table) == True)
+        tools.ok_(display._interval_bisearch(6, table) == True)
+        tools.ok_(display._interval_bisearch(7, table) == True)
+        tools.ok_(display._interval_bisearch(9, table) == True)
+        tools.ok_(display._interval_bisearch(10, table) == True)
+        tools.ok_(display._interval_bisearch(-1, table) == False)
+        tools.ok_(display._interval_bisearch(4, table) == False)
+        tools.ok_(display._interval_bisearch(8, table) == False)
+        tools.ok_(display._interval_bisearch(11, table) == False)
+
+    def test_internal_generate_combining_table(self):
+        '''Test that the combining table we generate is equal to or a subset of what's in the current table
+
+        If this assertion fails it can mean one of two things:
+
+        1. The code is broken
+        2. The table we have is out of date.
+        '''
+        old_table = display._COMBINING
+        new_table = display._generate_combining_table()
+        for interval in new_table:
+            if interval[0] == interval[1]:
+                tools.ok_(display._interval_bisearch(interval[0], old_table) == True)
+            else:
+                for codepoint in xrange(interval[0], interval[1] + 1):
+                    tools.ok_(display._interval_bisearch(codepoint, old_table) == True)
+
+    def test_internal_ucp_width(self):
+        '''Test that ucp_width returns proper width for characters'''
+        for codepoint in xrange(0, 0xFFFFF + 1):
+            if codepoint < 32 or (codepoint < 0xa0 and codepoint >= 0x7f):
+                # With strict on, we should raise an error
+                tools.assert_raises(ControlCharError, display._ucp_width, codepoint, 'strict')
+
+                if codepoint in (0x08, 0x1b, 0x7f, 0x94):
+                    # Backspace, delete, clear delete remove one char
+                    tools.ok_(display._ucp_width(codepoint) == -1)
+                else:
+                    # Everything else returns 0
+                    tools.ok_(display._ucp_width(codepoint) == 0)
+            elif display._interval_bisearch(codepoint, display._COMBINING):
+                # Combining character
+                tools.ok_(display._ucp_width(codepoint) == 0)
+            elif (codepoint >= 0x1100 and
+                  (codepoint <= 0x115f or  # Hangul Jamo init. consonants
+                   codepoint == 0x2329 or codepoint == 0x232a or
+                   (codepoint >= 0x2e80 and codepoint <= 0xa4cf and
+                    codepoint != 0x303f) or  # CJK ... 
Yi + (codepoint >= 0xac00 and codepoint <= 0xd7a3) or # Hangul Syllables + (codepoint >= 0xf900 and codepoint <= 0xfaff) or # CJK Compatibility Ideographs + (codepoint >= 0xfe10 and codepoint <= 0xfe19) or # Vertical forms + (codepoint >= 0xfe30 and codepoint <= 0xfe6f) or # CJK Compatibility Forms + (codepoint >= 0xff00 and codepoint <= 0xff60) or # Fullwidth Forms + (codepoint >= 0xffe0 and codepoint <= 0xffe6) or + (codepoint >= 0x20000 and codepoint <= 0x2fffd) or + (codepoint >= 0x30000 and codepoint <= 0x3fffd))): + tools.ok_(display._ucp_width(codepoint) == 2) + else: + tools.ok_(display._ucp_width(codepoint) == 1) + + def test_textual_width(self): + '''Test that we find the proper number of spaces that a utf8 string will consume''' + tools.ok_(display.textual_width(self.u_japanese) == 31) + tools.ok_(display.textual_width(self.u_spanish) == 50) + tools.ok_(display.textual_width(self.u_mixed) == 23) + + def test_textual_width_chop(self): + '''utf8_width_chop with byte strings''' + tools.ok_(display.textual_width_chop(self.u_mixed, 1000) == self.u_mixed) + tools.ok_(display.textual_width_chop(self.u_mixed, 23) == self.u_mixed) + tools.ok_(display.textual_width_chop(self.u_mixed, 22) == self.u_mixed[:-1]) + tools.ok_(display.textual_width_chop(self.u_mixed, 19) == self.u_mixed[:-4]) + tools.ok_(display.textual_width_chop(self.u_mixed, 1) == u'') + tools.ok_(display.textual_width_chop(self.u_mixed, 2) == self.u_mixed[0]) + tools.ok_(display.textual_width_chop(self.u_mixed, 3) == self.u_mixed[:2]) + tools.ok_(display.textual_width_chop(self.u_mixed, 4) == self.u_mixed[:3]) + tools.ok_(display.textual_width_chop(self.u_mixed, 5) == self.u_mixed[:4]) + tools.ok_(display.textual_width_chop(self.u_mixed, 6) == self.u_mixed[:5]) + tools.ok_(display.textual_width_chop(self.u_mixed, 7) == self.u_mixed[:5]) + tools.ok_(display.textual_width_chop(self.u_mixed, 8) == self.u_mixed[:6]) + tools.ok_(display.textual_width_chop(self.u_mixed, 9) == self.u_mixed[:7]) + tools.ok_(display.textual_width_chop(self.u_mixed, 10) == self.u_mixed[:8]) + tools.ok_(display.textual_width_chop(self.u_mixed, 11) == self.u_mixed[:9]) + tools.ok_(display.textual_width_chop(self.u_mixed, 12) == self.u_mixed[:10]) + tools.ok_(display.textual_width_chop(self.u_mixed, 13) == self.u_mixed[:10]) + tools.ok_(display.textual_width_chop(self.u_mixed, 14) == self.u_mixed[:11]) + tools.ok_(display.textual_width_chop(self.u_mixed, 15) == self.u_mixed[:12]) + tools.ok_(display.textual_width_chop(self.u_mixed, 16) == self.u_mixed[:13]) + tools.ok_(display.textual_width_chop(self.u_mixed, 17) == self.u_mixed[:14]) + tools.ok_(display.textual_width_chop(self.u_mixed, 18) == self.u_mixed[:15]) + tools.ok_(display.textual_width_chop(self.u_mixed, 19) == self.u_mixed[:15]) + tools.ok_(display.textual_width_chop(self.u_mixed, 20) == self.u_mixed[:16]) + tools.ok_(display.textual_width_chop(self.u_mixed, 21) == self.u_mixed[:17]) + + def test_textual_width_fill(self): + '''Pad a utf8 string''' + tools.ok_(display.textual_width_fill(self.u_mixed, 1) == self.u_mixed) + tools.ok_(display.textual_width_fill(self.u_mixed, 25) == self.u_mixed + u' ') + tools.ok_(display.textual_width_fill(self.u_mixed, 25, left=False) == u' ' + self.u_mixed) + tools.ok_(display.textual_width_fill(self.u_mixed, 25, chop=18) == self.u_mixed[:-4] + u' ') + tools.ok_(display.textual_width_fill(self.u_mixed, 25, chop=18, prefix=self.u_spanish, suffix=self.u_spanish) == self.u_spanish + self.u_mixed[:-4] + self.u_spanish + u' ') + 
tools.ok_(display.textual_width_fill(self.u_mixed, 25, chop=18) == self.u_mixed[:-4] + u' ') + tools.ok_(display.textual_width_fill(self.u_mixed, 25, chop=18, prefix=self.u_spanish, suffix=self.u_spanish) == self.u_spanish + self.u_mixed[:-4] + self.u_spanish + u' ') + + def test_internal_textual_width_le(self): + test_data = ''.join([self.u_mixed, self.u_spanish]) + tw = display.textual_width(test_data) + tools.ok_(display._textual_width_le(68, self.u_mixed, self.u_spanish) == (tw <= 68)) + tools.ok_(display._textual_width_le(69, self.u_mixed, self.u_spanish) == (tw <= 69)) + tools.ok_(display._textual_width_le(137, self.u_mixed, self.u_spanish) == (tw <= 137)) + tools.ok_(display._textual_width_le(138, self.u_mixed, self.u_spanish) == (tw <= 138)) + tools.ok_(display._textual_width_le(78, self.u_mixed, self.u_spanish) == (tw <= 78)) + tools.ok_(display._textual_width_le(79, self.u_mixed, self.u_spanish) == (tw <= 79)) + + def test_wrap(self): + '''Test that text wrapping works''' + tools.ok_(display.wrap(self.u_mixed) == [self.u_mixed]) + tools.ok_(display.wrap(self.u_paragraph) == self.u_paragraph_out) + tools.ok_(display.wrap(self.utf8_paragraph) == self.u_paragraph_out) + tools.ok_(display.wrap(self.u_mixed_para) == self.u_mixed_para_out) + tools.ok_(display.wrap(self.u_mixed_para, width=57, + initial_indent=' ', subsequent_indent='----') == + self.u_mixed_para_57_initial_subsequent_out) + + def test_fill(self): + tools.ok_(display.fill(self.u_paragraph) == u'\n'.join(self.u_paragraph_out)) + tools.ok_(display.fill(self.utf8_paragraph) == u'\n'.join(self.u_paragraph_out)) + tools.ok_(display.fill(self.u_mixed_para) == u'\n'.join(self.u_mixed_para_out)) + tools.ok_(display.fill(self.u_mixed_para, width=57, + initial_indent=' ', subsequent_indent='----') == + u'\n'.join(self.u_mixed_para_57_initial_subsequent_out)) + + def test_byte_string_textual_width_fill(self): + tools.ok_(display.byte_string_textual_width_fill(self.utf8_mixed, 1) == self.utf8_mixed) + tools.ok_(display.byte_string_textual_width_fill(self.utf8_mixed, 25) == self.utf8_mixed + ' ') + tools.ok_(display.byte_string_textual_width_fill(self.utf8_mixed, 25, left=False) == ' ' + self.utf8_mixed) + tools.ok_(display.byte_string_textual_width_fill(self.utf8_mixed, 25, chop=18) == self.u_mixed[:-4].encode('utf8') + ' ') + tools.ok_(display.byte_string_textual_width_fill(self.utf8_mixed, 25, chop=18, prefix=self.utf8_spanish, suffix=self.utf8_spanish) == self.utf8_spanish + self.u_mixed[:-4].encode('utf8') + self.utf8_spanish + ' ') + tools.ok_(display.byte_string_textual_width_fill(self.utf8_mixed, 25, chop=18) == self.u_mixed[:-4].encode('utf8') + ' ') + tools.ok_(display.byte_string_textual_width_fill(self.utf8_mixed, 25, chop=18, prefix=self.utf8_spanish, suffix=self.utf8_spanish) == self.utf8_spanish + self.u_mixed[:-4].encode('utf8') + self.utf8_spanish + ' ') + diff --git a/tests/test_text_misc.py b/tests/test_text_misc.py new file mode 100644 index 0000000..8f652e0 --- /dev/null +++ b/tests/test_text_misc.py @@ -0,0 +1,137 @@ +# -*- coding: utf-8 -*- +# +import unittest +from nose import tools +from nose.plugins.skip import SkipTest + +try: + import chardet +except ImportError: + chardet = None + +from kitchen.text import misc +from kitchen.text.exceptions import ControlCharError +from kitchen.text.converters import to_unicode + +import base_classes + +class TestTextMisc(unittest.TestCase, base_classes.UnicodeTestData): + def test_guess_encoding_no_chardet(self): + # Test that unicode strings are not allowed + 
tools.assert_raises(TypeError, misc.guess_encoding, self.u_spanish) + + tools.ok_(misc.guess_encoding(self.utf8_spanish, disable_chardet=True) == 'utf-8') + tools.ok_(misc.guess_encoding(self.latin1_spanish, disable_chardet=True) == 'latin-1') + tools.ok_(misc.guess_encoding(self.utf8_japanese, disable_chardet=True) == 'utf-8') + tools.ok_(misc.guess_encoding(self.euc_jp_japanese, disable_chardet=True) == 'latin-1') + + def test_guess_encoding_with_chardet(self): + # We go this slightly roundabout way because multiple encodings can + # output the same byte sequence. What we're really interested in is + # if we can get the original unicode string without knowing the + # converters beforehand + tools.ok_(to_unicode(self.utf8_spanish, + misc.guess_encoding(self.utf8_spanish)) == self.u_spanish) + tools.ok_(to_unicode(self.latin1_spanish, + misc.guess_encoding(self.latin1_spanish)) == self.u_spanish) + tools.ok_(to_unicode(self.utf8_japanese, + misc.guess_encoding(self.utf8_japanese)) == self.u_japanese) + + def test_guess_encoding_with_chardet_installed(self): + if chardet: + tools.ok_(to_unicode(self.euc_jp_japanese, + misc.guess_encoding(self.euc_jp_japanese)) == self.u_japanese) + else: + raise SkipTest('chardet not installed, euc_jp will not be guessed correctly') + + def test_guess_encoding_with_chardet_uninstalled(self): + if chardet: + raise SkipTest('chardet installed, euc_jp will not be mangled') + else: + tools.ok_(to_unicode(self.euc_jp_japanese, + misc.guess_encoding(self.euc_jp_japanese)) == + self.u_mangled_euc_jp_as_latin1) + + def test_str_eq(self): + # str vs str: + tools.ok_(misc.str_eq(self.euc_jp_japanese, self.euc_jp_japanese) == True) + tools.ok_(misc.str_eq(self.utf8_japanese, self.utf8_japanese) == True) + tools.ok_(misc.str_eq(self.b_ascii, self.b_ascii) == True) + tools.ok_(misc.str_eq(self.euc_jp_japanese, self.latin1_spanish) == False) + tools.ok_(misc.str_eq(self.utf8_japanese, self.euc_jp_japanese) == False) + tools.ok_(misc.str_eq(self.b_ascii, self.b_ascii[:-2]) == False) + + # unicode vs unicode: + tools.ok_(misc.str_eq(self.u_japanese, self.u_japanese) == True) + tools.ok_(misc.str_eq(self.u_ascii, self.u_ascii) == True) + tools.ok_(misc.str_eq(self.u_japanese, self.u_spanish) == False) + tools.ok_(misc.str_eq(self.u_ascii, self.u_ascii[:-2]) == False) + + # unicode vs str with default utf-8 conversion: + tools.ok_(misc.str_eq(self.u_japanese, self.utf8_japanese) == True) + tools.ok_(misc.str_eq(self.u_ascii, self.b_ascii) == True) + tools.ok_(misc.str_eq(self.u_japanese, self.euc_jp_japanese) == False) + tools.ok_(misc.str_eq(self.u_ascii, self.b_ascii[:-2]) == False) + + # unicode vs str with explicit encodings: + tools.ok_(misc.str_eq(self.u_japanese, self.euc_jp_japanese, encoding='euc_jp') == True) + tools.ok_(misc.str_eq(self.u_japanese, self.utf8_japanese, encoding='utf8') == True) + tools.ok_(misc.str_eq(self.u_ascii, self.b_ascii, encoding='latin1') == True) + tools.ok_(misc.str_eq(self.u_japanese, self.euc_jp_japanese, encoding='latin1') == False) + tools.ok_(misc.str_eq(self.u_japanese, self.utf8_japanese, encoding='euc_jp') == False) + tools.ok_(misc.str_eq(self.u_japanese, self.utf8_japanese, encoding='euc_jp') == False) + tools.ok_(misc.str_eq(self.u_ascii, self.b_ascii[:-2], encoding='latin1') == False) + + # str vs unicode (reverse parameter order of unicode vs str) + tools.ok_(misc.str_eq(self.utf8_japanese, self.u_japanese) == True) + tools.ok_(misc.str_eq(self.b_ascii, self.u_ascii) == True) + tools.ok_(misc.str_eq(self.euc_jp_japanese, 
self.u_japanese) == False) + tools.ok_(misc.str_eq(self.b_ascii, self.u_ascii[:-2]) == False) + + tools.ok_(misc.str_eq(self.euc_jp_japanese, self.u_japanese, encoding='euc_jp') == True) + tools.ok_(misc.str_eq(self.utf8_japanese, self.u_japanese, encoding='utf8') == True) + tools.ok_(misc.str_eq(self.b_ascii, self.u_ascii, encoding='latin1') == True) + tools.ok_(misc.str_eq(self.euc_jp_japanese, self.u_japanese, encoding='latin1') == False) + tools.ok_(misc.str_eq(self.utf8_japanese, self.u_japanese, encoding='euc_jp') == False) + tools.ok_(misc.str_eq(self.utf8_japanese, self.u_japanese, encoding='euc_jp') == False) + tools.ok_(misc.str_eq(self.b_ascii, self.u_ascii[:-2], encoding='latin1') == False) + + + def test_process_control_chars(self): + tools.assert_raises(TypeError, misc.process_control_chars, 'byte string') + tools.assert_raises(ControlCharError, misc.process_control_chars, + *[self.u_ascii_chars], **{'strategy':'strict'}) + tools.ok_(misc.process_control_chars(self.u_ascii_chars, + strategy='ignore') == self.u_ascii_no_ctrl) + tools.ok_(misc.process_control_chars(self.u_ascii_chars, + strategy='replace') == self.u_ascii_ctrl_replace) + + def test_html_entities_unescape(self): + tools.assert_raises(TypeError, misc.html_entities_unescape, 'byte string') + tools.ok_(misc.html_entities_unescape(self.u_entity_escape) == self.u_entity) + tools.ok_(misc.html_entities_unescape(u'%s' + % self.u_entity_escape) == self.u_entity) + tools.ok_(misc.html_entities_unescape(u'a�b') == u'a�b') + tools.ok_(misc.html_entities_unescape(u'a�b') == u'a\ufffdb') + tools.ok_(misc.html_entities_unescape(u'a�b') == u'a\ufffdb') + + def test_byte_string_valid_xml(self): + tools.ok_(misc.byte_string_valid_xml(u'unicode string') == False) + + tools.ok_(misc.byte_string_valid_xml(self.utf8_japanese)) + tools.ok_(misc.byte_string_valid_xml(self.euc_jp_japanese, 'euc_jp')) + + tools.ok_(misc.byte_string_valid_xml(self.utf8_japanese, 'euc_jp') == False) + tools.ok_(misc.byte_string_valid_xml(self.euc_jp_japanese, 'utf8') == False) + + tools.ok_(misc.byte_string_valid_xml(self.utf8_ascii_chars) == False) + + def test_byte_string_valid_encoding(self): + '''Test that a byte sequence is validated''' + tools.ok_(misc.byte_string_valid_encoding(self.utf8_japanese) == True) + tools.ok_(misc.byte_string_valid_encoding(self.euc_jp_japanese, encoding='euc_jp') == True) + + def test_byte_string_invalid_encoding(self): + '''Test that we return False with non-encoded chars''' + tools.ok_(misc.byte_string_valid_encoding('\xff') == False) + tools.ok_(misc.byte_string_valid_encoding(self.euc_jp_japanese) == False) diff --git a/tests/test_text_utf8.py b/tests/test_text_utf8.py new file mode 100644 index 0000000..c380718 --- /dev/null +++ b/tests/test_text_utf8.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- +# +import unittest +from nose import tools + +import warnings + +from kitchen.text import utf8 + +import base_classes + +class TestUTF8(base_classes.UnicodeTestData, unittest.TestCase): + def setUp(self): + # All of the utf8* functions are deprecated + warnings.simplefilter('ignore', DeprecationWarning) + + def tearDown(self): + warnings.simplefilter('default', DeprecationWarning) + + def test_utf8_width(self): + '''Test that we find the proper number of spaces that a utf8 string will consume''' + tools.ok_(utf8.utf8_width(self.utf8_japanese) == 31) + tools.ok_(utf8.utf8_width(self.utf8_spanish) == 50) + tools.ok_(utf8.utf8_width(self.utf8_mixed) == 23) + + def test_utf8_width_non_utf8(self): + '''Test that we handle 
non-utf8 bytes in utf8_width without backtracing''' + # utf8_width() treats non-utf8 byte sequences as undecodable so you + # end up with less characters than normal. In this string: + # Python-2.7+ replaces problematic characters in a different manner + # than older pythons. + # Python >= 2.7: + # El veloz murci�lago salt� sobre el perro perezoso. + # Python < 2.7: + # El veloz murci�go salt�bre el perro perezoso. + if len(unicode(u'\xe9la'.encode('latin1'), 'utf8', 'replace')) == 1: + # Python < 2.7 + tools.ok_(utf8.utf8_width(self.latin1_spanish) == 45) + else: + # Python >= 2.7 + tools.ok_(utf8.utf8_width(self.latin1_spanish) == 50) + + def test_utf8_width_chop(self): + '''utf8_width_chop with byte strings''' + tools.ok_(utf8.utf8_width_chop(self.utf8_mixed) == (23, self.utf8_mixed)) + tools.ok_(utf8.utf8_width_chop(self.utf8_mixed, 23) == (23, self.utf8_mixed)) + tools.ok_(utf8.utf8_width_chop(self.utf8_mixed, 22) == (22, self.utf8_mixed[:-1])) + tools.ok_(utf8.utf8_width_chop(self.utf8_mixed, 19) == (18, self.u_mixed[:-4].encode('utf8'))) + tools.ok_(utf8.utf8_width_chop(self.utf8_mixed, 2) == (2, self.u_mixed[0].encode('utf8'))) + tools.ok_(utf8.utf8_width_chop(self.utf8_mixed, 1) == (0, '')) + + def test_utf8_width_chop_unicode(self): + '''utf8_width_chop with unicode input''' + tools.ok_(utf8.utf8_width_chop(self.u_mixed) == (23, self.u_mixed)) + tools.ok_(utf8.utf8_width_chop(self.u_mixed, 23) == (23, self.u_mixed)) + tools.ok_(utf8.utf8_width_chop(self.u_mixed, 22) == (22, self.u_mixed[:-1])) + tools.ok_(utf8.utf8_width_chop(self.u_mixed, 19) == (18, self.u_mixed[:-4])) + tools.ok_(utf8.utf8_width_chop(self.u_mixed, 2) == (2, self.u_mixed[0])) + tools.ok_(utf8.utf8_width_chop(self.u_mixed, 1) == (0, '')) + + def test_utf8_width_fill(self): + '''Pad a utf8 string''' + tools.ok_(utf8.utf8_width_fill(self.utf8_mixed, 1) == self.utf8_mixed) + tools.ok_(utf8.utf8_width_fill(self.utf8_mixed, 25) == self.utf8_mixed + ' ') + tools.ok_(utf8.utf8_width_fill(self.utf8_mixed, 25, left=False) == ' ' + self.utf8_mixed) + tools.ok_(utf8.utf8_width_fill(self.utf8_mixed, 25, chop=18) == self.u_mixed[:-4].encode('utf8') + ' ') + tools.ok_(utf8.utf8_width_fill(self.utf8_mixed, 25, chop=18, prefix=self.utf8_spanish, suffix=self.utf8_spanish) == self.utf8_spanish + self.u_mixed[:-4].encode('utf8') + self.utf8_spanish + ' ') + tools.ok_(utf8.utf8_width_fill(self.utf8_mixed, 25, chop=18) == self.u_mixed[:-4].encode('utf8') + ' ') + tools.ok_(utf8.utf8_width_fill(self.u_mixed, 25, chop=18, prefix=self.u_spanish, suffix=self.utf8_spanish) == self.u_spanish.encode('utf8') + self.u_mixed[:-4].encode('utf8') + self.u_spanish.encode('utf8') + ' ') + pass + + def test_utf8_valid(self): + '''Test that a utf8 byte sequence is validated''' + warnings.simplefilter('ignore', DeprecationWarning) + tools.ok_(utf8.utf8_valid(self.utf8_japanese) == True) + tools.ok_(utf8.utf8_valid(self.utf8_spanish) == True) + warnings.simplefilter('default', DeprecationWarning) + + def test_utf8_invalid(self): + '''Test that we return False with non-utf8 chars''' + warnings.simplefilter('ignore', DeprecationWarning) + tools.ok_(utf8.utf8_valid('\xff') == False) + tools.ok_(utf8.utf8_valid(self.latin1_spanish) == False) + warnings.simplefilter('default', DeprecationWarning) + + def test_utf8_text_wrap(self): + tools.ok_(utf8.utf8_text_wrap(self.utf8_mixed) == [self.utf8_mixed]) + tools.ok_(utf8.utf8_text_wrap(self.utf8_paragraph) == self.utf8_paragraph_out) + tools.ok_(utf8.utf8_text_wrap(self.utf8_mixed_para) == 
self.utf8_mixed_para_out) + tools.ok_(utf8.utf8_text_wrap(self.utf8_mixed_para, width=57, + initial_indent=' ', subsequent_indent='----') == + self.utf8_mixed_para_57_initial_subsequent_out) diff --git a/tests/test_versioning.py b/tests/test_versioning.py new file mode 100644 index 0000000..7fe2ce5 --- /dev/null +++ b/tests/test_versioning.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- +# +import unittest +from nose import tools + +from kitchen.versioning import version_tuple_to_string + +# Note: Using nose's generator tests for this so we can't subclass +# unittest.TestCase +class TestVersionTuple(object): + ver_to_tuple = {u'1': ((1,),), + u'1.0': ((1, 0),), + u'1.0.0': ((1, 0, 0),), + u'1.0a1': ((1, 0), ('a', 1)), + u'1.0a1': ((1, 0), (u'a', 1)), + u'1.0rc1': ((1, 0), ('rc', 1)), + u'1.0rc1': ((1, 0), (u'rc', 1)), + u'1.0rc1.2': ((1, 0), ('rc', 1, 2)), + u'1.0rc1.2': ((1, 0), (u'rc', 1, 2)), + u'1.0.dev345': ((1, 0), ('dev', 345)), + u'1.0.dev345': ((1, 0), (u'dev', 345)), + u'1.0a1.dev345': ((1, 0), ('a', 1), ('dev', 345)), + u'1.0a1.dev345': ((1, 0), (u'a', 1), (u'dev', 345)), + u'1.0a1.2.dev345': ((1, 0), ('a', 1, 2), ('dev', 345)), + u'1.0a1.2.dev345': ((1, 0), (u'a', 1, 2), (u'dev', 345)), + } + + def check_ver_tuple_to_str(self, v_tuple, v_str): + tools.ok_(version_tuple_to_string(v_tuple) == v_str) + + def test_version_tuple_to_string(self): + '''Test that version_tuple_to_string outputs PEP-386 compliant strings + ''' + for v_str, v_tuple in self.ver_to_tuple.items(): + yield self.check_ver_tuple_to_str, v_tuple, v_str
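+            # nose runs each yielded (callable, args...) tuple as its own
+            # test case, so every mapping above is checked independently.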