PoDoFo 1.2.0
Loading...
Searching...
No Matches
PdfTokenizer.h
1// SPDX-FileCopyrightText: 2006 Dominik Seichter <domseichter@web.de>
2// SPDX-FileCopyrightText: 2020 Francesco Pretto <ceztko@gmail.com>
3// SPDX-License-Identifier: LGPL-2.0-or-later OR MPL-2.0
4
5#ifndef PDF_TOKENIZER_H
6#define PDF_TOKENIZER_H
7
8#include "PdfDeclarations.h"
9#include <podofo/auxiliary/InputDevice.h>
10#include "PdfStatefulEncrypt.h"
11
12#include <deque>
13
14namespace PoDoFo {
15
16class PdfVariant;
17
// PostScript language level used to configure tokenization (see
// PdfTokenizerOptions::LanguageLevel). Stored as a uint8_t; each enumerator's
// numeric value equals its level number.
18enum class PdfPostScriptLanguageLevel : uint8_t
19{
20 L1 = 1,
21 L2 = 2,
22};
23
// Options accepted by the PdfTokenizer constructors. Every member has a
// default, so a default-constructed instance ("{ }") is a valid configuration.
24struct PODOFO_API PdfTokenizerOptions final
25{
 // PostScript language level to tokenize with; defaults to L2.
26 PdfPostScriptLanguageLevel LanguageLevel = PdfPostScriptLanguageLevel::L2;
 // Defaults to true.
 // NOTE(review): presumably controls whether indirect object references are
 // recognized while reading variants -- confirm against the implementation.
27 bool ReadReferences = true;
28};
29
// A simple tokenizer for PDF files and PDF content streams.
//
// All read operations take the InputStreamDevice to read from as an explicit
// argument; the class itself stores a shared character buffer, the options it
// was constructed with, a queue of tokens and a second character buffer
// (see the private members at the bottom).
31class PODOFO_API PdfTokenizer
32{
 // These classes are granted access to the protected/private members.
33 friend class PdfParser;
34 friend class PdfPostScriptTokenizer;
35 PODOFO_PRIVATE_FRIEND(class PdfParserObject);
36
37public:
 // NOTE(review): presumably the size in bytes of the internally allocated
 // token buffer -- confirm in the constructor implementation.
38 static constexpr unsigned BufferSize = 4096;
 // Upper bound of 64 MiB (64 * 1024 * 1024 bytes).
 // NOTE(review): presumably the maximum accepted length of a string token --
 // confirm where it is enforced.
39 static constexpr size_t MaxStringLength = 64 * 1024 * 1024; // 64 MiB
40
41public:
 // Construct a tokenizer; default-constructed options are used when
 // `options` is omitted.
42 PdfTokenizer(const PdfTokenizerOptions& options = { });
 // Construct a tokenizer using a caller-supplied shared buffer.
 // NOTE(review): whether a null `buffer` is accepted is not visible here.
43 PdfTokenizer(std::shared_ptr<charbuff> buffer, const PdfTokenizerOptions& options = { });
44
 // Reads the next token from the current file position ignoring all
 // comments. On success `token` and `tokenType` describe the token read.
 // NOTE(review): `token` is a string_view, so it presumably refers to storage
 // owned by the tokenizer and may be invalidated by the next read -- confirm.
62 bool TryReadNextToken(InputStreamDevice& device, std::string_view& token, PdfTokenType& tokenType);
63
 // Try to peek the next token from the current file position ignoring all
 // comments, without actually consuming it.
69 bool TryPeekNextToken(InputStreamDevice& device, std::string_view& token, PdfTokenType& tokenType);
70
 // Read the next token as a 64-bit signed integer.
 // NOTE(review): the non-Try overload presumably raises an error when the
 // next token is not a number, while the Try overload reports it through its
 // bool result -- confirm.
79 int64_t ReadNextNumber(InputStreamDevice& device);
80 bool TryReadNextNumber(InputStreamDevice& device, int64_t& value);
81
 // Read the next variant from the current file position ignoring all
 // comments, storing it in `variant`. `encrypt` is optional and defaults to
 // a null pointer.
 // NOTE(review): presumably `encrypt` is used to decrypt string values when
 // non-null -- confirm.
90 void ReadNextVariant(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt = { });
91 bool TryReadNextVariant(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt = { });
92
 // NOTE(review): presumably discards any queued/peeked tokens -- confirm.
93 void Reset();
94
95protected:
96 // This enum differs from the regular PdfDataType in that
97 // it enumerates only the data types that can be determined literally
98 // during tokenization, and it additionally distinguishes whether a
99 // string literal is a regular string or a hex string
100 enum class PdfLiteralDataType : uint8_t
101 {
102 Unknown = 0,
103 Bool,
104 Number,
105 Real,
106 String,
107 HexString,
108 Name,
109 Array,
110 Dictionary,
111 Null,
112 Reference,
113 };
114
115protected:
 // Read the next variant from the current file position ignoring all
 // comments. Unlike the public overloads, these take an already read
 // `token` together with its `tokenType`.
125 void ReadNextVariant(InputStreamDevice& device, const std::string_view& token, PdfTokenType tokenType, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
126 bool TryReadNextVariant(InputStreamDevice& device, const std::string_view& token, PdfTokenType tokenType, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
127
 // NOTE(review): presumably appends the token to m_tokenQueue so it is
 // returned by a later read -- confirm.
136 void EnqueueToken(const std::string_view& token, PdfTokenType type);
137
 // Readers for the individual literal kinds; each stores its result in
 // `variant`. ReadName is the only one that takes no `encrypt` argument.
 // NOTE(review): presumably each expects the opening delimiter to have been
 // consumed already -- confirm in the implementation.
143 void ReadDictionary(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
144
150 void ReadArray(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
151
157 void ReadString(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
158
164 void ReadHexString(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
165
173 void ReadName(InputStreamDevice& device, PdfVariant& variant);
174
 // Classify `token` as one of the PdfLiteralDataType values.
 // NOTE(review): it also receives `device` and `variant`, so it presumably
 // may read further input and fill `variant` for simple types -- confirm.
180 PdfLiteralDataType DetermineDataType(InputStreamDevice& device, const std::string_view& token, PdfTokenType tokenType, PdfVariant& variant);
181
182private:
 // Private constructor distinguished from the public ones by the
 // std::in_place_t tag; takes the buffer by rvalue reference.
183 PdfTokenizer(std::in_place_t, std::shared_ptr<charbuff>&& buffer, const PdfTokenizerOptions& options);
 // NOTE(review): presumably dispatches on `dataType` to the Read* helpers
 // above -- confirm.
184 bool tryReadDataType(InputStreamDevice& device, PdfLiteralDataType dataType, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
185
186private:
 // A queued token owns its text (std::string) and is paired with its type.
187 using TokenizerPair = std::pair<std::string, PdfTokenType>;
188 using TokenizerQueue = std::deque<TokenizerPair>;
189
190private:
 // Shared character buffer (may be supplied by the caller, see constructors).
191 std::shared_ptr<charbuff> m_buffer;
 // Options given at construction.
192 PdfTokenizerOptions m_options;
 // Queue of tokens (see EnqueueToken).
193 TokenizerQueue m_tokenQueue;
 // Additional character buffer owned by this instance.
 // NOTE(review): its exact use is not visible in this header.
194 charbuff m_charBuffer;
195};
196
197};
198
199#endif // PDF_TOKENIZER_H
This file should be included as the FIRST file in every header of PoDoFo lib.
This class represents an input device. It optionally supports peeking.
Definition InputDevice.h:19
This class is a parser for general PostScript content in PDF documents.
Definition PdfPostScriptTokenizer.h:25
A simple tokenizer for PDF files and PDF content streams.
Definition PdfTokenizer.h:32
bool TryReadNextToken(InputStreamDevice &device, std::string_view &token)
Reads the next token from the current file position ignoring all comments.
bool TryPeekNextToken(InputStreamDevice &device, std::string_view &token)
Try to peek the next token from the current file position, ignoring all comments, without actually consuming it.
void ReadNextVariant(InputStreamDevice &device, const std::string_view &token, PdfTokenType tokenType, PdfVariant &variant, const PdfStatefulEncrypt *encrypt)
Read the next variant from the current file position ignoring all comments.
A variant data type which supports all data types supported by the PDF standard.
Definition PdfVariant.h:29
Convenient type for char array storage and/or buffer with std::string compatibility.
Definition basetypes.h:30
All classes, functions, types and enums of PoDoFo are members of this namespace.
Definition basetypes.h:13