PoDoFo  1.0.0-dev
PdfCharCodeMap.h
1 
7 #ifndef PDF_CHAR_CODE_MAP_H
8 #define PDF_CHAR_CODE_MAP_H
9 
10 #include "PdfDeclarations.h"
11 #include "PdfEncodingCommon.h"
12 
13 namespace PoDoFo
14 {
// Code point type used by the char code map API; it is an alias of char32_t.
21  using codepoint = char32_t;
// Non-owning, read-only view over a contiguous run of code points
// (cspan is the project's constant span alias).
22  using codepointview = cspan<codepoint>;
23 
// A memory-owning block of code points. Storage is a union of two layouts
// that both start with a 32-bit Size: an inline array of up to 3 code points
// (m_Block) and a heap array held by std::unique_ptr (m_Array).
// NOTE(review): which layout is active is presumably decided from Size
// (inline when Size <= 3) -- the member definitions are not in this header,
// so confirm against the .cpp.
28  class PODOFO_API CodePointSpan final
29  {
30  public:
    // Default constructor; presumably yields an empty span -- TODO confirm.
31  CodePointSpan();
    // User-declared destructor: needed because the union may hold a
    // std::unique_ptr that the compiler cannot destroy automatically.
32  ~CodePointSpan();
    // NOTE(review): the listing's own numbering jumps 32 -> 34 and 35 -> 37,
    // so a declaration or comment may be hidden here (e.g. a copy
    // constructor to pair with the copy assignment below) -- check the real header.
    // Construct from a view of code points.
34  CodePointSpan(const codepointview& view);
    // Construct from a view plus one extra code point (presumably appended
    // after the view's content -- confirm). Note that the parameter name
    // `codepoint` shadows the `codepoint` type alias inside this declaration.
35  CodePointSpan(const codepointview& view, codepoint codepoint);
    // Writes the stored code points into the caller-supplied vector.
    // NOTE(review): whether it replaces or appends is not visible here.
37  void CopyTo(std::vector<codepoint>& codePoints) const;
    // Returns a non-owning view; assume it is only valid while this object
    // is alive and unmodified.
38  codepointview view() const;
    // Number of code points held (presumably the shared Size field).
39  unsigned GetSize() const;
    // Copy assignment, user-declared because of the owning union member.
40  CodePointSpan& operator=(const CodePointSpan&);
    // Implicit conversion to a view, so a CodePointSpan can be passed
    // wherever a codepointview is expected.
41  operator codepointview() const;
42 
    // Dereference yields a single code point.
    // NOTE(review): presumably the first one; behaviour on an empty span is
    // not visible here.
47  codepoint operator*() const;
48 
49  private:
    // Both alternatives begin with a uint32_t Size, so Size sits at the same
    // offset whichever alternative is in use.
50  union
51  {
        // Inline storage: capacity fixed at 3 code points, no heap allocation.
52  struct
53  {
54  uint32_t Size;
55  std::array<codepoint, 3> Data;
56  } m_Block;
57 
        // Heap storage: owning pointer to a dynamically sized array.
58  struct
59  {
60  uint32_t Size;
61  std::unique_ptr<codepoint[]> Data;
62  } m_Array;
63  };
64  };
65 
66  // Map code units -> code point(s)
67  // pp. 474-475 of PdfReference 1.7 "The value of dstString can be a string of up to 512 bytes"
// Hash map keyed by PdfCharCode; each value owns its code points through
// CodePointSpan. Used for the direct (non-range) mappings.
68  using CodeUnitMap = std::unordered_map<PdfCharCode, CodePointSpan>;
69 
// One range mapping: a run of consecutive source code units starting at
// SrcCodeLo, together with the destination code point(s) for the start.
70  struct PODOFO_API CodeUnitRange final
71  {
    // Lowest source code unit of the range.
72  PdfCharCode SrcCodeLo;
    // Number of code units in the range; a default-constructed range has 0.
73  unsigned Size = 0;
    // Destination code point(s) associated with SrcCodeLo.
    // NOTE(review): following codes presumably map to incremented
    // destinations -- confirm in the implementation.
74  CodePointSpan DstCodeLo;
75 
    // Highest source code unit of the range.
    // NOTE(review): presumably derived from SrcCodeLo and Size; the
    // definition is not in this header.
76  PdfCharCode GetSrcCodeHi() const;
77  };
78 
// Strict ordering for CodeUnitRange that compares the lower source code only
// (SrcCodeLo); Size and DstCodeLo take no part, so two ranges with the same
// SrcCodeLo are equivalent for an ordered container.
79  struct PODOFO_API CodeUnitRangeInequality
80  {
    // Marks the comparator as transparent, so ordered containers can look up
    // a bare PdfCharCode without building a temporary CodeUnitRange.
81  using is_transparent = std::true_type;
82 
    // range < code: compares the range's lower bound against the code.
83  bool operator()(const CodeUnitRange& lhs, const PdfCharCode& rhs) const
84  {
85  return lhs.SrcCodeLo < rhs;
86  }
    // code < range: mirror of the overload above, needed for heterogeneous lookup.
87  bool operator()(const PdfCharCode& lhs, const CodeUnitRange& rhs) const
88  {
89  return lhs < rhs.SrcCodeLo;
90  }
    // range < range: the ordering the container uses for its own elements.
91  bool operator()(const CodeUnitRange& lhs, const CodeUnitRange& rhs) const
92  {
93  return lhs.SrcCodeLo < rhs.SrcCodeLo;
94  }
95  };
96 
// Ordered set of range mappings, sorted by SrcCodeLo through the transparent
// comparator, so it can be searched directly with a PdfCharCode.
97  using CodeUnitRanges = std::set<CodeUnitRange, CodeUnitRangeInequality>;
98 
// A bidirectional map from character code units to code points.
// Forward data (code unit -> code points) is kept in m_Mappings (direct
// entries) and m_Ranges (range entries). Reverse lookups use a binary search
// tree of CodePointMapNode rooted at m_codePointMapHead.
// The class is move-only: copy construction and copy assignment are deleted.
108  class PODOFO_API PdfCharCodeMap final
109  {
     // NOTE(review): presumably grants PdfCMapEncodingFactory access to the
     // private constructor below; the macro is defined elsewhere.
110  PODOFO_PRIVATE_FRIEND(class PdfCMapEncodingFactory);
111 
112  public:
113  PdfCharCodeMap();
114 
115  PdfCharCodeMap(PdfCharCodeMap&& map) noexcept;
116 
117  ~PdfCharCodeMap();
118 
119  private:
     // Builds a map from already prepared containers and limits; private, so
     // only friends can use it.
120  PdfCharCodeMap(CodeUnitMap&& mapping, CodeUnitRanges&& ranges, const PdfEncodingLimits& limits);
121 
122  public:
     // Adds a direct mapping from one code unit to a sequence of code points.
127  void PushMapping(const PdfCharCode& codeUnit, const codepointview& codePoints);
128 
     // Adds a direct mapping from one code unit to a single code point.
131  void PushMapping(const PdfCharCode& codeUnit, codepoint codePoint);
132 
     // Adds a range of `size` code units starting at srcCodeLo, whose first
     // destination is the single code point dstCodeLo.
137  void PushRange(const PdfCharCode& srcCodeLo, unsigned size, codepoint dstCodeLo);
138 
     // Same as above, with a sequence of code points as the first destination.
143  void PushRange(const PdfCharCode& srcCodeLo, unsigned size, const codepointview& dstCodeLo);
144 
     // Forward lookup: on success returns true and fills `codePoints`.
     // NOTE(review): the state of `codePoints` on failure is not visible here.
147  bool TryGetCodePoints(const PdfCharCode& codeUnit, CodePointSpan& codePoints) const;
148 
     // Reverse lookup driven by a string iterator. `it` is taken by non-const
     // reference, so it is presumably advanced past the consumed input --
     // confirm in the implementation.
153  bool TryGetNextCharCode(std::string_view::iterator& it,
154  const std::string_view::iterator& end, PdfCharCode& code) const;
155 
     // Reverse lookup: code point sequence -> code unit.
159  bool TryGetCharCode(const codepointview& codePoints, PdfCharCode& code) const;
160 
     // Reverse lookup: single code point -> code unit.
163  bool TryGetCharCode(codepoint codePoint, PdfCharCode& code) const;
164 
165  PdfCharCodeMap& operator=(PdfCharCodeMap&& map) noexcept;
166 
     // Encoding limits tracked for this map (see updateLimits below).
167  const PdfEncodingLimits& GetLimits() const { return m_Limits; }
168 
169  bool IsEmpty() const;
170 
     // NOTE(review): presumably true when every code unit maps to the code
     // point with the same numeric value -- the definition is not shown.
173  bool IsTrivialIdentity() const;
174 
     // NOTE(review): presumably the distinct code unit byte sizes present in
     // the map -- confirm.
177  std::vector<unsigned char> GetCodeRangeSizes() const;
178 
179  public:
     // Provides direct mappings (read-only access to the underlying container).
182  const CodeUnitMap& GetMappings() const { return m_Mappings; }
183 
     // Provides range mappings (read-only access to the underlying container).
186  const CodeUnitRanges& GetRanges() const { return m_Ranges; }
187 
188  private:
     // Helper presumably shared by the move constructor and move assignment.
189  void move(PdfCharCodeMap& map) noexcept;
190  void pushMapping(const PdfCharCode& codeUnit, const codepointview& codePoints);
191 
192  // Map code point(s) -> code units
     // Tree node keyed by CodePoint. Left/Right are the BST children;
     // Ligatures points to a further tree, presumably for the next code point
     // of a multi-code-point sequence. Nodes are linked with raw pointers.
193  struct CodePointMapNode
194  {
195  codepoint CodePoint;
196  PdfCharCode CodeUnit;
197  CodePointMapNode* Ligatures;
198  CodePointMapNode* Left;
199  CodePointMapNode* Right;
200  };
201 
202  private:
     // Copying is disabled; the class is move-only.
203  PdfCharCodeMap(const PdfCharCodeMap&) = delete;
204  PdfCharCodeMap& operator=(const PdfCharCodeMap&) = delete;
205 
206  private:
207  void updateLimits(const PdfCharCode& codeUnit);
     // NOTE(review): presumably rebuilds the reverse tree when m_MapDirty is
     // set -- confirm.
208  void reviseCodePointMap();
209  bool tryFixNextRanges(const CodeUnitRanges::iterator& it, unsigned prevRangeCodeUpper);
210  static bool tryFindNextCharacterId(const CodePointMapNode* node, std::string_view::iterator &it,
211  const std::string_view::iterator& end, PdfCharCode& cid);
212  static const CodePointMapNode* findNode(const CodePointMapNode* node, codepoint codePoint);
     // NOTE(review): presumably frees the node and its subtrees; ownership of
     // the raw node pointers is managed manually.
213  static void deleteNode(CodePointMapNode* node);
     // Takes the link by reference, so it can attach a newly created node in place.
214  static CodePointMapNode* findOrAddNode(CodePointMapNode*& node, codepoint codePoint);
215 
216  private:
217  PdfEncodingLimits m_Limits;
218  CodeUnitMap m_Mappings;
219  CodeUnitRanges m_Ranges;
     // NOTE(review): presumably set when the forward data changes and the
     // reverse tree is out of date.
220  bool m_MapDirty;
221  CodePointMapNode* m_codePointMapHead; // Head of a BST to lookup code points
222  };
223 }
224 
225 #endif // PDF_CHAR_CODE_MAP_H
SPDX-FileCopyrightText: (C) 2005 Dominik Seichter domseichter@web.de SPDX-FileCopyrightText: (C) 2020...
A memory owning immutable block of code points, optimized for small segments as up to 3 elements can ...
Definition: PdfCharCodeMap.h:29
A bidirectional map from character code units to unspecified code points.
Definition: PdfCharCodeMap.h:109
const CodeUnitMap & GetMappings() const
Provides direct mappings.
Definition: PdfCharCodeMap.h:182
const CodeUnitRanges & GetRanges() const
Provides range mappings.
Definition: PdfCharCodeMap.h:186
SPDX-FileCopyrightText: (C) 2022 Francesco Pretto ceztko@gmail.com SPDX-License-Identifier: LGPL-2....
Definition: basetypes.h:16
char32_t codepoint
A convenient typedef for an unspecified codepoint. The underlying type is conveniently char32_t so it...
Definition: PdfCharCodeMap.h:21
tcb::span< const T, Extent > cspan
Constant span.
Definition: span.h:13
A character code unit.
Definition: PdfEncodingCommon.h:20