PoDoFo 1.0.0-dev
Loading...
Searching...
No Matches
PdfTokenizer.h
1
7#ifndef PDF_TOKENIZER_H
8#define PDF_TOKENIZER_H
9
10#include "PdfDeclarations.h"
11#include <podofo/auxiliary/InputDevice.h>
12#include "PdfStatefulEncrypt.h"
13
14#include <deque>
15
16namespace PoDoFo {
17
18class PdfVariant;
19
// Identifies a PostScript language level. The underlying type is uint8_t and
// each enumerator's value equals its level number (L1 == 1, L2 == 2).
// Used as the type of PdfTokenizerOptions::LanguageLevel.
20enum class PdfPostScriptLanguageLevel : uint8_t
21{
22 L1 = 1,
23 L2 = 2,
24};
25
// Options aggregate accepted by both PdfTokenizer constructors.
// Every member has a default, so `{ }` yields the default configuration.
26struct PODOFO_API PdfTokenizerOptions final
27{
 // PostScript language level; defaults to L2.
 // NOTE(review): what actually differs between L1 and L2 tokenization is
 // not visible in this header - confirm in the implementation.
28 PdfPostScriptLanguageLevel LanguageLevel = PdfPostScriptLanguageLevel::L2;
 // Defaults to true.
 // NOTE(review): presumably enables recognition of indirect references
 // (PdfLiteralDataType::Reference) while reading variants - confirm.
29 bool ReadReferences = true;
30};
31
// A simple tokenizer for PDF files and PDF content streams.
//
// All reading functions take the InputStreamDevice to read from as their
// first argument; the tokenizer itself keeps only its options, a shared
// buffer, a queue of tokens and a character buffer (see the private members).
35class PODOFO_API PdfTokenizer
36{
37 PODOFO_PRIVATE_FRIEND(class PdfParserObject);
38
39public:
 // NOTE(review): presumably the size in bytes of the buffer allocated by the
 // constructor that does not receive one - confirm in the implementation.
40 static constexpr unsigned BufferSize = 4096;
41
42public:
 // Construct a tokenizer with the given options (defaults when omitted).
43 PdfTokenizer(const PdfTokenizerOptions& options = { });
 // Construct a tokenizer with a caller-supplied shared buffer.
 // NOTE(review): presumably stored in m_buffer so the buffer can be shared
 // between tokenizers - confirm.
44 PdfTokenizer(const std::shared_ptr<charbuff>& buffer, const PdfTokenizerOptions& options = { });
45
 // Reads the next token from the current file position ignoring all comments.
 // On success `token` and `tokenType` receive the token and its type.
 // NOTE(review): `token` is a std::string_view, so it refers to storage it
 // does not own; how long it stays valid is not visible here - confirm.
64 bool TryReadNextToken(InputStreamDevice& device, std::string_view& token, PdfTokenType& tokenType);
65
 // Try to peek the next token from the current file position ignoring all
 // comments, without actually consuming it.
72 bool TryPeekNextToken(InputStreamDevice& device, std::string_view& token, PdfTokenType& tokenType);
73
 // Read the next token as an integer number and return it.
 // NOTE(review): presumably raises an error when the next token is not a
 // number, whereas the Try variant reports that through its bool result and
 // stores the number in `value` - confirm.
83 int64_t ReadNextNumber(InputStreamDevice& device);
84 bool TryReadNextNumber(InputStreamDevice& device, int64_t& value);
85
 // Read the next variant from the current file position ignoring all comments.
 // `encrypt` may be null (it defaults to a null pointer).
 // NOTE(review): presumably used to decrypt string contents when non-null,
 // and the Try variant presumably returns false instead of raising - confirm.
95 void ReadNextVariant(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt = { });
96 bool TryReadNextVariant(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt = { });
97
98protected:
99 // This enum differs from regular PdfDataType in the sense
100 // it enumerates only data types that can be determined literally
101 // by the tokenization and specify better if the strings literals
102 // are regular or hex strings
103 enum class PdfLiteralDataType : uint8_t
104 {
105 Unknown = 0,
106 Bool,
107 Number,
108 Real,
109 String,
110 HexString,
111 Name,
112 Array,
113 Dictionary,
114 Null,
115 Reference,
116 };
117
118protected:
 // Read the next variant from the current file position ignoring all
 // comments, given a token (and its type) that has already been read.
129 void ReadNextVariant(InputStreamDevice& device, const std::string_view& token, PdfTokenType tokenType, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
130 bool TryReadNextVariant(InputStreamDevice& device, const std::string_view& token, PdfTokenType tokenType, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
131
 // NOTE(review): presumably appends (token, type) to m_tokenQueque so that
 // a later read returns it before touching the device - confirm.
141 void EnqueueToken(const std::string_view& token, PdfTokenType type);
142
 // The Read* helpers below each read one value of the named kind from
 // `device` into `variant`.
 // NOTE(review): presumably each is called after the opening delimiter of
 // the value has already been consumed - confirm in the implementation.
149 void ReadDictionary(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
150
157 void ReadArray(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
158
165 void ReadString(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
166
173 void ReadHexString(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
174
 // Unlike the other Read* helpers, ReadName takes no encryption context.
182 void ReadName(InputStreamDevice& device, PdfVariant& variant);
183
 // Classify `token` as one of the PdfLiteralDataType values.
 // NOTE(review): `variant` is an output parameter; presumably it is filled
 // directly for types that need no further reading - confirm.
190 PdfLiteralDataType DetermineDataType(InputStreamDevice& device, const std::string_view& token, PdfTokenType tokenType, PdfVariant& variant);
191
192private:
 // NOTE(review): presumably dispatches on `dataType` to the Read* helpers
 // above and returns false for types it cannot read - confirm.
193 bool tryReadDataType(InputStreamDevice& device, PdfLiteralDataType dataType, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
194
195private:
 // A queued token is stored as an owning std::string together with its type.
 // NOTE(review): "Queque" is a misspelling of "Queue"; left unchanged here
 // because these names may be referenced by the implementation file.
196 using TokenizerPair = std::pair<std::string, PdfTokenType>;
197 using TokenizerQueque = std::deque<TokenizerPair>;
198
199private:
200 std::shared_ptr<charbuff> m_buffer;
201 PdfTokenizerOptions m_options;
202 TokenizerQueque m_tokenQueque;
203 charbuff m_charBuffer;
204};
205
206};
207
208#endif // PDF_TOKENIZER_H
SPDX-FileCopyrightText: (C) 2005 Dominik Seichter domseichter@web.de SPDX-FileCopyrightText: (C) 2020...
This class represents an input device. It optionally supports peeking.
Definition InputDevice.h:22
A simple tokenizer for PDF files and PDF content streams.
Definition PdfTokenizer.h:36
bool TryReadNextToken(InputStreamDevice &device, std::string_view &token)
Reads the next token from the current file position ignoring all comments.
bool TryPeekNextToken(InputStreamDevice &device, std::string_view &token)
Try peek the next token from the current file position ignoring all comments, without actually consum...
void ReadNextVariant(InputStreamDevice &device, const std::string_view &token, PdfTokenType tokenType, PdfVariant &variant, const PdfStatefulEncrypt *encrypt)
Read the next variant from the current file position ignoring all comments.
A variant data type which supports all data types supported by the PDF standard.
Definition PdfVariant.h:33
Convenient type for char array storage and/or buffer with std::string compatibility.
Definition basetypes.h:38
SPDX-FileCopyrightText: (C) 2022 Francesco Pretto ceztko@gmail.com SPDX-License-Identifier: LGPL-2....
Definition basetypes.h:16