PoDoFo 1.1.0
Loading...
Searching...
No Matches
PdfTokenizer.h
1
7#ifndef PDF_TOKENIZER_H
8#define PDF_TOKENIZER_H
9
10#include "PdfDeclarations.h"
11#include <podofo/auxiliary/InputDevice.h>
12#include "PdfStatefulEncrypt.h"
13
14#include <deque>
15
16namespace PoDoFo {
17
18class PdfVariant;
19
// Enumerates PostScript language levels. The enumerators L1 and L2 carry the
// numeric values 1 and 2 and are stored in a uint8_t.
// Used as the type of PdfTokenizerOptions::LanguageLevel.
20enum class PdfPostScriptLanguageLevel : uint8_t
21{
22 L1 = 1,
23 L2 = 2,
24};
25
// Option bundle accepted by the PdfTokenizer constructors.
// A default-constructed instance selects language level L2 and
// ReadReferences == true.
26struct PODOFO_API PdfTokenizerOptions final
27{
    // PostScript language level to use; defaults to L2.
28 PdfPostScriptLanguageLevel LanguageLevel = PdfPostScriptLanguageLevel::L2;
    // Defaults to true.
    // NOTE(review): presumably controls whether indirect object references
    // are recognized while reading variants -- confirm in the implementation.
29 bool ReadReferences = true;
30};
31
// A simple tokenizer for PDF files and PDF content streams.
//
// All reading functions take the InputStreamDevice to read from as their
// first argument. PdfParser, PdfPostScriptTokenizer and PdfParserObject are
// declared as friends and can therefore reach the protected/private members.
35class PODOFO_API PdfTokenizer
36{
37 friend class PdfParser;
38 friend class PdfPostScriptTokenizer;
39 PODOFO_PRIVATE_FRIEND(class PdfParserObject);
40
41public:
    // NOTE(review): presumably the size of the internal working buffer used
    // when none is supplied -- confirm in the implementation.
42 static constexpr unsigned BufferSize = 4096;
    // NOTE(review): presumably an upper bound on the length of a string the
    // tokenizer accepts -- confirm where it is enforced.
43 static constexpr size_t MaxStringLength = 64 * 1024 * 1024; // 64 MiB
44
45public:
    // Constructs a tokenizer with the given options (default-constructed
    // options when omitted).
46 PdfTokenizer(const PdfTokenizerOptions& options = { });
    // Same as above, but takes a caller-supplied shared charbuff.
    // NOTE(review): presumably used as the working buffer (stored in
    // m_buffer) -- confirm.
47 PdfTokenizer(std::shared_ptr<charbuff> buffer, const PdfTokenizerOptions& options = { });
48
    // Reads the next token from the current file position ignoring all
    // comments. The token text and its type are delivered through the
    // `token` and `tokenType` output references.
    // NOTE(review): the bool result presumably signals whether a token was
    // read -- confirm.
67 bool TryReadNextToken(InputStreamDevice& device, std::string_view& token, PdfTokenType& tokenType);
68
    // Tries to peek the next token from the current file position ignoring
    // all comments, without actually consuming it. Outputs mirror
    // TryReadNextToken.
75 bool TryPeekNextToken(InputStreamDevice& device, std::string_view& token, PdfTokenType& tokenType);
76
    // Reads the next number and returns it as int64_t.
    // NOTE(review): failure behavior (likely an exception) is not visible
    // here -- confirm.
86 int64_t ReadNextNumber(InputStreamDevice& device);
    // Non-returning-by-value counterpart: the number is delivered through
    // `value` and the bool result reports the outcome.
87 bool TryReadNextNumber(InputStreamDevice& device, int64_t& value);
88
    // Reads the next variant into `variant`. `encrypt` is optional and
    // defaults to a null pointer.
    // NOTE(review): presumably used to decrypt strings when non-null --
    // confirm.
98 void ReadNextVariant(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt = { });
    // Same inputs as ReadNextVariant, reporting the outcome as a bool.
99 bool TryReadNextVariant(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt = { });
100
    // NOTE(review): presumably discards internal state such as queued
    // tokens -- confirm in the implementation.
101 void Reset();
102
103protected:
104 // This enum differs from regular PdfDataType in the sense
105 // it enumerates only data types that can be determined literally
106 // by the tokenization and specify better if the strings literals
107 // are regular or hex strings
108 enum class PdfLiteralDataType : uint8_t
109 {
110 Unknown = 0,
111 Bool,
112 Number,
113 Real,
114 String,
115 HexString,
116 Name,
117 Array,
118 Dictionary,
119 Null,
120 Reference,
121 };
122
123protected:
    // Read the next variant from the current file position ignoring all
    // comments. These overloads additionally take a `token` / `tokenType`
    // pair supplied by the caller.
    // NOTE(review): presumably a token that was already read from the
    // device -- confirm.
134 void ReadNextVariant(InputStreamDevice& device, const std::string_view& token, PdfTokenType tokenType, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
135 bool TryReadNextVariant(InputStreamDevice& device, const std::string_view& token, PdfTokenType tokenType, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
136
    // NOTE(review): presumably appends the token to m_tokenQueue so that it
    // is returned by a later read -- confirm.
146 void EnqueueToken(const std::string_view& token, PdfTokenType type);
147
    // Readers for individual literal kinds (dictionary, array, string, hex
    // string, name). Each receives the destination `variant` by non-const
    // reference; ReadName is the only one that takes no `encrypt` argument.
154 void ReadDictionary(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
155
162 void ReadArray(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
163
170 void ReadString(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
171
178 void ReadHexString(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
179
187 void ReadName(InputStreamDevice& device, PdfVariant& variant);
188
    // Classifies the given token as one of the PdfLiteralDataType values.
    // NOTE(review): `variant` is passed by non-const reference, so simple
    // types are presumably stored into it directly -- confirm.
195 PdfLiteralDataType DetermineDataType(InputStreamDevice& device, const std::string_view& token, PdfTokenType tokenType, PdfVariant& variant);
196
197private:
    // Tag-dispatched constructor (std::in_place_t) taking the buffer by
    // rvalue reference.
    // NOTE(review): presumably the common target the public constructors
    // delegate to -- confirm.
198 PdfTokenizer(std::in_place_t, std::shared_ptr<charbuff>&& buffer, const PdfTokenizerOptions& options);
    // Reads a value of the already-determined `dataType` into `variant`.
199 bool tryReadDataType(InputStreamDevice& device, PdfLiteralDataType dataType, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
200
201private:
    // A queued token: owned copy of the token text plus its type.
202 using TokenizerPair = std::pair<std::string, PdfTokenType>;
203 using TokenizerQueue = std::deque<TokenizerPair>;
204
205private:
    // Shared buffer (see the constructors taking std::shared_ptr<charbuff>).
206 std::shared_ptr<charbuff> m_buffer;
    // Copy of the options passed at construction.
207 PdfTokenizerOptions m_options;
    // NOTE(review): presumably filled by EnqueueToken / peeking -- confirm.
208 TokenizerQueue m_tokenQueue;
    // NOTE(review): additional scratch buffer; exact use not visible here.
209 charbuff m_charBuffer;
210};
211
212};
213
214#endif // PDF_TOKENIZER_H
SPDX-FileCopyrightText: (C) 2005 Dominik Seichter domseichter@web.de SPDX-FileCopyrightText: (C) 2020...
This class represents an input device. It optionally supports peeking.
Definition InputDevice.h:22
This class is a parser for general PostScript content in PDF documents.
Definition PdfPostScriptTokenizer.h:30
A simple tokenizer for PDF files and PDF content streams.
Definition PdfTokenizer.h:36
bool TryReadNextToken(InputStreamDevice &device, std::string_view &token)
Reads the next token from the current file position ignoring all comments.
bool TryPeekNextToken(InputStreamDevice &device, std::string_view &token)
Try peek the next token from the current file position ignoring all comments, without actually consum...
void ReadNextVariant(InputStreamDevice &device, const std::string_view &token, PdfTokenType tokenType, PdfVariant &variant, const PdfStatefulEncrypt *encrypt)
Read the next variant from the current file position ignoring all comments.
A variant data type which supports all data types supported by the PDF standard.
Definition PdfVariant.h:33
Convenient type for char array storage and/or buffer with std::string compatibility.
Definition basetypes.h:38
SPDX-FileCopyrightText: (C) 2022 Francesco Pretto ceztko@gmail.com SPDX-License-Identifier: LGPL-2....
Definition basetypes.h:16