stringops.h 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. ///////////////////////////////////////////////////////////////////////////////
  2. // Name: wx/stringops.h
  3. // Purpose: implementation of wxString primitive operations
  4. // Author: Vaclav Slavik
  5. // Modified by:
  6. // Created: 2007-04-16
  7. // Copyright: (c) 2007 REA Elektronik GmbH
  8. // Licence: wxWindows licence
  9. ///////////////////////////////////////////////////////////////////////////////
  10. #ifndef _WX_WXSTRINGOPS_H__
  11. #define _WX_WXSTRINGOPS_H__
  12. #include "wx/chartype.h"
  13. #include "wx/stringimpl.h"
  14. #include "wx/unichar.h"
  15. #include "wx/buffer.h"
  16. // This header contains wxStringOperations "namespace" class that implements
  17. // elementary operations on string data as static methods; wxString methods and
  18. // iterators are implemented in terms of it. Two implementations are available,
  19. // one for UTF-8 encoded char* strings and one for "raw" wchar_t* strings (or
  20. // char* in ANSI build).
  21. // FIXME-UTF8: only wchar after we remove ANSI build
  22. #if wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE
  23. struct WXDLLIMPEXP_BASE wxStringOperationsWchar
  24. {
  25. // moves the iterator to the next Unicode character
  26. template <typename Iterator>
  27. static void IncIter(Iterator& i) { ++i; }
  28. // moves the iterator to the previous Unicode character
  29. template <typename Iterator>
  30. static void DecIter(Iterator& i) { --i; }
  31. // moves the iterator by n Unicode characters
  32. template <typename Iterator>
  33. static Iterator AddToIter(const Iterator& i, ptrdiff_t n)
  34. { return i + n; }
  35. // returns distance of the two iterators in Unicode characters
  36. template <typename Iterator>
  37. static ptrdiff_t DiffIters(const Iterator& i1, const Iterator& i2)
  38. { return i1 - i2; }
  39. // encodes the character to a form used to represent it in internal
  40. // representation (returns a string in UTF8 version)
  41. static wxChar EncodeChar(const wxUniChar& ch) { return (wxChar)ch; }
  42. static wxUniChar DecodeChar(const wxStringImpl::const_iterator& i)
  43. { return *i; }
  44. };
  45. #endif // wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE
  46. #if wxUSE_UNICODE_UTF8
  47. struct WXDLLIMPEXP_BASE wxStringOperationsUtf8
  48. {
  49. // checks correctness of UTF-8 sequence
  50. static bool IsValidUtf8String(const char *c,
  51. size_t len = wxStringImpl::npos);
  52. static bool IsValidUtf8LeadByte(unsigned char c)
  53. {
  54. return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4);
  55. }
  56. // table of offsets to skip forward when iterating over UTF-8 sequence
  57. static const unsigned char ms_utf8IterTable[256];
  58. template<typename Iterator>
  59. static void IncIter(Iterator& i)
  60. {
  61. wxASSERT( IsValidUtf8LeadByte(*i) );
  62. i += ms_utf8IterTable[(unsigned char)*i];
  63. }
  64. template<typename Iterator>
  65. static void DecIter(Iterator& i)
  66. {
  67. // Non-lead bytes are all in the 0x80..0xBF range (i.e. 10xxxxxx in
  68. // binary), so we just have to go back until we hit a byte that is
  69. // either < 0x80 (i.e. 0xxxxxxx in binary) or 0xC0..0xFF (11xxxxxx in
  70. // binary; this includes some invalid values, but we can ignore it
  71. // here, because we assume valid UTF-8 input for the purpose of
  72. // efficient implementation).
  73. --i;
  74. while ( ((*i) & 0xC0) == 0x80 /* 2 highest bits are '10' */ )
  75. --i;
  76. }
  77. template<typename Iterator>
  78. static Iterator AddToIter(const Iterator& i, ptrdiff_t n)
  79. {
  80. Iterator out(i);
  81. if ( n > 0 )
  82. {
  83. for ( ptrdiff_t j = 0; j < n; ++j )
  84. IncIter(out);
  85. }
  86. else if ( n < 0 )
  87. {
  88. for ( ptrdiff_t j = 0; j > n; --j )
  89. DecIter(out);
  90. }
  91. return out;
  92. }
  93. template<typename Iterator>
  94. static ptrdiff_t DiffIters(Iterator i1, Iterator i2)
  95. {
  96. ptrdiff_t dist = 0;
  97. if ( i1 < i2 )
  98. {
  99. while ( i1 != i2 )
  100. {
  101. IncIter(i1);
  102. dist--;
  103. }
  104. }
  105. else if ( i2 < i1 )
  106. {
  107. while ( i2 != i1 )
  108. {
  109. IncIter(i2);
  110. dist++;
  111. }
  112. }
  113. return dist;
  114. }
  115. // encodes the character as UTF-8:
  116. typedef wxUniChar::Utf8CharBuffer Utf8CharBuffer;
  117. static Utf8CharBuffer EncodeChar(const wxUniChar& ch)
  118. { return ch.AsUTF8(); }
  119. // returns n copies of ch encoded in UTF-8 string
  120. static wxCharBuffer EncodeNChars(size_t n, const wxUniChar& ch);
  121. // returns the length of UTF-8 encoding of the character with lead byte 'c'
  122. static size_t GetUtf8CharLength(char c)
  123. {
  124. wxASSERT( IsValidUtf8LeadByte(c) );
  125. return ms_utf8IterTable[(unsigned char)c];
  126. }
  127. // decodes single UTF-8 character from UTF-8 string
  128. static wxUniChar DecodeChar(wxStringImpl::const_iterator i)
  129. {
  130. if ( (unsigned char)*i < 0x80 )
  131. return (int)*i;
  132. return DecodeNonAsciiChar(i);
  133. }
  134. private:
  135. static wxUniChar DecodeNonAsciiChar(wxStringImpl::const_iterator i);
  136. };
  137. #endif // wxUSE_UNICODE_UTF8
  138. #if wxUSE_UNICODE_UTF8
  139. typedef wxStringOperationsUtf8 wxStringOperations;
  140. #else
  141. typedef wxStringOperationsWchar wxStringOperations;
  142. #endif
  143. #endif // _WX_WXSTRINGOPS_H__