PoDoFo 1.1.0
Loading...
Searching...
No Matches
PdfTokenizer.h
1
7#ifndef PDF_TOKENIZER_H
8#define PDF_TOKENIZER_H
9
10#include "PdfDeclarations.h"
11#include <podofo/auxiliary/InputDevice.h>
12#include "PdfStatefulEncrypt.h"
13
14#include <deque>
15
16namespace PoDoFo {
17
18class PdfVariant;
19
/** PostScript language level, stored in a single byte (uint8_t).
 *
 * Used as the type of PdfTokenizerOptions::LanguageLevel.
 * The enumerator values match the numeric level (L1 == 1, L2 == 2).
 */
// NOTE(review): which tokens/operators each level enables is decided by the
// tokenizer implementation, which is not visible here -- confirm there.
20enum class PdfPostScriptLanguageLevel : uint8_t
21{
22 L1 = 1,
23 L2 = 2,
24};
25
/** Options accepted by the PdfTokenizer constructors.
 *
 * A default-constructed instance selects PostScript language level L2
 * and has ReadReferences enabled.
 */
26struct PODOFO_API PdfTokenizerOptions final
27{
    // PostScript language level to use; defaults to L2.
28 PdfPostScriptLanguageLevel LanguageLevel = PdfPostScriptLanguageLevel::L2;
    // Defaults to true.
    // NOTE(review): presumably controls whether indirect references are
    // recognized as PdfLiteralDataType::Reference -- confirm in the implementation.
29 bool ReadReferences = true;
30};
31
/**
 * A simple tokenizer for PDF files and PDF content streams.
 *
 * All reading methods take the InputStreamDevice to read from as their
 * first argument; the tokenizer itself keeps a shared character buffer,
 * its options, a queue of (token text, token type) pairs and an extra
 * character buffer as state (see the private members below).
 */
35class PODOFO_API PdfTokenizer
36{
    // PdfParser and PdfPostScriptTokenizer get access to non-public members.
    // PdfParserObject is befriended through the PODOFO_PRIVATE_FRIEND macro,
    // whose exact expansion is defined elsewhere.
37 friend class PdfParser;
38 friend class PdfPostScriptTokenizer;
39 PODOFO_PRIVATE_FRIEND(class PdfParserObject);
40
41public:
    // NOTE(review): presumably the size (in bytes) of the internal token
    // buffer allocated by default -- confirm in the implementation.
42 static constexpr unsigned BufferSize = 4096;
43
44public:
    /** Construct a tokenizer with the given options
     *  (default-constructed options when omitted).
     */
45 PdfTokenizer(const PdfTokenizerOptions& options = { });
    /** Construct a tokenizer that uses the supplied shared buffer,
     *  with the given options (default-constructed options when omitted).
     */
    // NOTE(review): requirements on the buffer (non-null, minimum size)
    // are not visible in this header -- confirm in the implementation.
46 PdfTokenizer(std::shared_ptr<charbuff> buffer, const PdfTokenizerOptions& options = { });
47
    /** Reads the next token from the current file position
     *  ignoring all comments.
     *
     *  \param device     the device to read from
     *  \param token      output: view on the token that was read
     *  \param tokenType  output: type of the token that was read
     */
    // NOTE(review): presumably returns false when no token could be read,
    // and `token` likely refers to internal storage that is only valid until
    // the next read -- confirm both against the implementation.
66 bool TryReadNextToken(InputStreamDevice& device, std::string_view& token, PdfTokenType& tokenType);
67
    /** Try to peek the next token from the current file position
     *  ignoring all comments, without actually consuming it.
     *
     *  \param device     the device to read from
     *  \param token      output: view on the peeked token
     *  \param tokenType  output: type of the peeked token
     */
    // NOTE(review): presumably returns false when no token is available -- confirm.
74 bool TryPeekNextToken(InputStreamDevice& device, std::string_view& token, PdfTokenType& tokenType);
75
    /** Read the next number from the device and return it as int64_t. */
    // NOTE(review): presumably raises an error when the next token is not a
    // number, whereas TryReadNextNumber reports that through its bool
    // return value and writes the result to `value` -- confirm.
85 int64_t ReadNextNumber(InputStreamDevice& device);
86 bool TryReadNextNumber(InputStreamDevice& device, int64_t& value);
87
    /** Read the next variant from the device into `variant`.
     *
     *  \param device   the device to read from
     *  \param variant  output: the variant that was read
     *  \param encrypt  optional encryption state; defaults to a null pointer
     */
    // NOTE(review): presumably `encrypt`, when non-null, is used to decrypt
    // strings, and the Try* form returns false instead of raising an error
    // -- confirm against the implementation.
97 void ReadNextVariant(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt = { });
98 bool TryReadNextVariant(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt = { });
99
    // NOTE(review): presumably discards any queued tokens so the tokenizer
    // can be reused -- confirm in the implementation.
100 void Reset();
101
102protected:
103 // This enum differs from regular PdfDataType in the sense
104 // it enumerates only data types that can be determined literally
105 // by the tokenization and specify better if the strings literals
106 // are regular or hex strings
107 enum class PdfLiteralDataType : uint8_t
108 {
109 Unknown = 0,
110 Bool,
111 Number,
112 Real,
113 String,
114 HexString,
115 Name,
116 Array,
117 Dictionary,
118 Null,
119 Reference,
120 };
121
122protected:
    /** Read the next variant from the current file position
     *  ignoring all comments.
     *
     *  These overloads additionally take a token and its type.
     */
    // NOTE(review): presumably `token`/`tokenType` are a token the caller has
    // already read and from which parsing continues -- confirm.
133 void ReadNextVariant(InputStreamDevice& device, const std::string_view& token, PdfTokenType tokenType, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
134 bool TryReadNextVariant(InputStreamDevice& device, const std::string_view& token, PdfTokenType tokenType, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
135
    // NOTE(review): presumably stores the token in m_tokenQueue so that it is
    // returned again by a later read -- confirm in the implementation.
145 void EnqueueToken(const std::string_view& token, PdfTokenType type);
146
    // Readers for the individual literal kinds. Each one writes its result
    // into `variant`; all but ReadName also take the encryption state.
    // NOTE(review): the exact stream position each reader expects on entry
    // (e.g. opening delimiter already consumed or not) is not visible here.
153 void ReadDictionary(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
154
161 void ReadArray(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
162
169 void ReadString(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
170
177 void ReadHexString(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
178
186 void ReadName(InputStreamDevice& device, PdfVariant& variant);
187
    /** Determine the literal data type for the given token.
     *
     *  \returns the detected PdfLiteralDataType
     */
    // NOTE(review): takes `device` and `variant` as well, so it presumably may
    // read further input and/or fill `variant` for simple types -- confirm.
194 PdfLiteralDataType DetermineDataType(InputStreamDevice& device, const std::string_view& token, PdfTokenType tokenType, PdfVariant& variant);
195
196private:
    // Private constructor; the std::in_place_t tag parameter distinguishes it
    // from the public overloads. The buffer is taken by rvalue reference.
    // NOTE(review): presumably the public constructors delegate here -- confirm.
197 PdfTokenizer(std::in_place_t, std::shared_ptr<charbuff>&& buffer, const PdfTokenizerOptions& options);
    // Reads a value of an already determined literal data type into `variant`.
    // NOTE(review): presumably returns false for types it cannot read -- confirm.
198 bool tryReadDataType(InputStreamDevice& device, PdfLiteralDataType dataType, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
199
200private:
    // A queued token: owned copy of the token text plus its token type.
201 using TokenizerPair = std::pair<std::string, PdfTokenType>;
202 using TokenizerQueue = std::deque<TokenizerPair>;
203
204private:
    // Shared character buffer (may be supplied by the caller, see constructors).
205 std::shared_ptr<charbuff> m_buffer;
    // Copy of the options given at construction.
206 PdfTokenizerOptions m_options;
    // Queue of (token text, token type) pairs.
207 TokenizerQueue m_tokenQueue;
    // NOTE(review): additional character buffer; its purpose is not visible
    // in this header -- confirm in the implementation.
208 charbuff m_charBuffer;
209};
210
211};
212
213#endif // PDF_TOKENIZER_H
SPDX-FileCopyrightText: (C) 2005 Dominik Seichter domseichter@web.de SPDX-FileCopyrightText: (C) 2020...
This class represents an input device. It optionally supports peeking.
Definition InputDevice.h:22
This class is a parser for general PostScript content in PDF documents.
Definition PdfPostScriptTokenizer.h:30
A simple tokenizer for PDF files and PDF content streams.
Definition PdfTokenizer.h:36
bool TryReadNextToken(InputStreamDevice &device, std::string_view &token)
Reads the next token from the current file position ignoring all comments.
bool TryPeekNextToken(InputStreamDevice &device, std::string_view &token)
Try to peek the next token from the current file position, ignoring all comments, without actually consuming it.
void ReadNextVariant(InputStreamDevice &device, const std::string_view &token, PdfTokenType tokenType, PdfVariant &variant, const PdfStatefulEncrypt *encrypt)
Read the next variant from the current file position ignoring all comments.
A variant data type which supports all data types supported by the PDF standard.
Definition PdfVariant.h:33
Convenient type for char array storage and/or buffer with std::string compatibility.
Definition basetypes.h:38
SPDX-FileCopyrightText: (C) 2022 Francesco Pretto ceztko@gmail.com SPDX-License-Identifier: LGPL-2....
Definition basetypes.h:16