PoDoFo  1.0.0-dev
PdfTokenizer.h
1 
7 #ifndef PDF_TOKENIZER_H
8 #define PDF_TOKENIZER_H
9 
10 #include "PdfDeclarations.h"
11 #include <podofo/auxiliary/InputDevice.h>
12 #include "PdfStatefulEncrypt.h"
13 
14 #include <deque>
15 
16 namespace PoDoFo {
17 
18 class PdfVariant;
19 
/**
 * Kind of token handed back by the tokenizer through the `tokenType`
 * out-parameter of the token reading/peeking functions.
 *
 * The enumerator names suggest the PDF delimiter they stand for
 * (for example ParenthesisLeft for an opening parenthesis); the actual
 * character-to-type mapping lives in the tokenizer implementation and
 * is not visible in this header.
 *
 * Numeric values are consecutive, starting at zero.
 */
enum class PdfTokenType
{
    Unknown                  = 0,
    Literal                  = 1,
    ParenthesisLeft          = 2,
    ParenthesisRight         = 3,
    BraceLeft                = 4,
    BraceRight               = 5,
    AngleBracketLeft         = 6,
    AngleBracketRight        = 7,
    DoubleAngleBracketsLeft  = 8,
    DoubleAngleBracketsRight = 9,
    SquareBracketLeft        = 10,
    SquareBracketRight       = 11,
    Slash                    = 12,
};
36 
/**
 * PostScript language level selectable through PdfTokenizerOptions.
 *
 * Each enumerator's numeric value equals the level number it names.
 */
enum class PdfPostScriptLanguageLevel
{
    /** Language level 1 */
    L1 = 1,

    /** Language level 2 */
    L2 = 2,
};
42 
43 struct PODOFO_API PdfTokenizerOptions final
44 {
45  PdfPostScriptLanguageLevel LanguageLevel = PdfPostScriptLanguageLevel::L2;
46  bool ReadReferences = true;
47 };
48 
52 class PODOFO_API PdfTokenizer
53 {
54  PODOFO_PRIVATE_FRIEND(class PdfParserObject);
55 
56 public:
57  static constexpr unsigned BufferSize = 4096;
58 
59 public:
60  PdfTokenizer(const PdfTokenizerOptions& options = { });
61  PdfTokenizer(const std::shared_ptr<charbuff>& buffer, const PdfTokenizerOptions& options = { });
62 
80  bool TryReadNextToken(InputStreamDevice& device, std::string_view& token);
81  bool TryReadNextToken(InputStreamDevice& device, std::string_view& token, PdfTokenType& tokenType);
82 
88  bool TryPeekNextToken(InputStreamDevice& device, std::string_view& token);
89  bool TryPeekNextToken(InputStreamDevice& device, std::string_view& token, PdfTokenType& tokenType);
90 
100  int64_t ReadNextNumber(InputStreamDevice& device);
101  bool TryReadNextNumber(InputStreamDevice& device, int64_t& value);
102 
112  void ReadNextVariant(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt = { });
113  bool TryReadNextVariant(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt = { });
114 
115 public:
121  static bool IsWhitespace(char ch);
122 
126  static bool IsDelimiter(char ch);
127 
130  static bool IsTokenDelimiter(char ch, PdfTokenType& tokenType);
131 
137  static bool IsRegular(char ch);
138 
143  static bool IsPrintable(char ch);
144 
145 protected:
146  // This enum differs from regular PdfDataType in the sense
147  // it enumerates only data types that can be determined literally
148  // by the tokenization and specify better if the strings literals
149  // are regular or hex strings
150  enum class PdfLiteralDataType
151  {
152  Unknown = 0,
153  Bool,
154  Number,
155  Real,
156  String,
157  HexString,
158  Name,
159  Array,
160  Dictionary,
161  Null,
162  Reference,
163  };
164 
165 protected:
176  void ReadNextVariant(InputStreamDevice& device, const std::string_view& token, PdfTokenType tokenType, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
177  bool TryReadNextVariant(InputStreamDevice& device, const std::string_view& token, PdfTokenType tokenType, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
178 
188  void EnqueueToken(const std::string_view& token, PdfTokenType type);
189 
196  void ReadDictionary(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
197 
204  void ReadArray(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
205 
212  void ReadString(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
213 
220  void ReadHexString(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
221 
229  void ReadName(InputStreamDevice& device, PdfVariant& variant);
230 
237  PdfLiteralDataType DetermineDataType(InputStreamDevice& device, const std::string_view& token, PdfTokenType tokenType, PdfVariant& variant);
238 
239 private:
240  bool tryReadDataType(InputStreamDevice& device, PdfLiteralDataType dataType, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
241 
242 private:
243  using TokenizerPair = std::pair<std::string, PdfTokenType>;
244  using TokenizerQueque = std::deque<TokenizerPair>;
245 
246 private:
247  std::shared_ptr<charbuff> m_buffer;
248  PdfTokenizerOptions m_options;
249  TokenizerQueque m_tokenQueque;
250  charbuff m_charBuffer;
251 };
252 
253 };
254 
255 #endif // PDF_TOKENIZER_H
SPDX-FileCopyrightText: (C) 2005 Dominik Seichter domseichter@web.de SPDX-FileCopyrightText: (C) 2020...
This class represents an input device. It optionally supports peeking.
Definition: InputDevice.h:22
A simple tokenizer for PDF files and PDF content streams.
Definition: PdfTokenizer.h:53
bool TryReadNextToken(InputStreamDevice &device, std::string_view &token)
Reads the next token from the current file position ignoring all comments.
bool TryPeekNextToken(InputStreamDevice &device, std::string_view &token)
Tries to peek at the next token from the current file position, ignoring all comments, without actually consum...
void ReadNextVariant(InputStreamDevice &device, const std::string_view &token, PdfTokenType tokenType, PdfVariant &variant, const PdfStatefulEncrypt *encrypt)
Read the next variant from the current file position ignoring all comments.
A variant data type which supports all data types supported by the PDF standard.
Definition: PdfVariant.h:33
Convenient type for char array storage and/or buffer with std::string compatibility.
Definition: basetypes.h:38
SPDX-FileCopyrightText: (C) 2022 Francesco Pretto ceztko@gmail.com SPDX-License-Identifier: LGPL-2....
Definition: basetypes.h:16