PoDoFo 1.0.0-dev
Loading...
Searching...
No Matches
PdfTokenizer.h
1
7#ifndef PDF_TOKENIZER_H
8#define PDF_TOKENIZER_H
9
10#include "PdfDeclarations.h"
11#include <podofo/auxiliary/InputDevice.h>
12#include "PdfStatefulEncrypt.h"
13
14#include <deque>
15
16namespace PoDoFo {
17
18class PdfVariant;
19
// PostScript language level, used as the type of PdfTokenizerOptions::LanguageLevel.
// The enumerators are given the explicit values 1 and 2 and the underlying
// storage type is uint8_t.
20enum class PdfPostScriptLanguageLevel : uint8_t
21{
22 L1 = 1,
23 L2 = 2,
24};
25
// Option bundle accepted by the PdfTokenizer constructors.
// Both members have in-class defaults, so a value-initialized instance
// (as used by the constructors' `= { }` default argument) is fully specified.
26struct PODOFO_API PdfTokenizerOptions final
27{
 // PostScript language level to use; defaults to PdfPostScriptLanguageLevel::L2.
28 PdfPostScriptLanguageLevel LanguageLevel = PdfPostScriptLanguageLevel::L2;
 // Defaults to true.
 // NOTE(review): presumably controls whether indirect references are
 // recognized while reading variants - confirm against the implementation.
29 bool ReadReferences = true;
30};
31
// A simple tokenizer for PDF files and PDF content streams.
//
// All reading methods take the InputStreamDevice to read from as their first
// argument; the class itself only holds a buffer, its options and a queue of
// tokens (see the private members at the bottom).
35class PODOFO_API PdfTokenizer
36{
 // PdfPostScriptTokenizer and PdfParserObject are granted access to the
 // non-public members of this class.
37 friend class PdfPostScriptTokenizer;
38 PODOFO_PRIVATE_FRIEND(class PdfParserObject);
39
40public:
 // NOTE(review): presumably the size of the buffer allocated when no
 // external buffer is supplied - confirm against the implementation.
41 static constexpr unsigned BufferSize = 4096;
42
43public:
 // Construct a tokenizer with the given options (all defaults if omitted).
44 PdfTokenizer(const PdfTokenizerOptions& options = { });
 // Construct a tokenizer using a caller-supplied, shared charbuff.
 // NOTE(review): the handling of a null buffer is not visible here - confirm.
45 PdfTokenizer(std::shared_ptr<charbuff> buffer, const PdfTokenizerOptions& options = { });
46
 // Reads the next token from the current file position ignoring all comments.
 // `token` and `tokenType` are output parameters.
 // NOTE(review): presumably returns false when no further token could be
 // read - confirm. The lifetime of the returned string_view is not visible
 // here; assume it is only valid until the next read on this tokenizer.
65 bool TryReadNextToken(InputStreamDevice& device, std::string_view& token, PdfTokenType& tokenType);
66
 // Tries to peek the next token from the current file position ignoring all
 // comments. NOTE(review): presumably the token is not consumed and is
 // returned again by the next read - confirm.
73 bool TryPeekNextToken(InputStreamDevice& device, std::string_view& token, PdfTokenType& tokenType);
74
 // Reads the next token as a 64-bit integer and returns it.
 // NOTE(review): presumably raises an error when the next token is not a
 // number, whereas the Try variant reports failure through its bool result
 // and writes the number to `value` - confirm.
84 int64_t ReadNextNumber(InputStreamDevice& device);
85 bool TryReadNextNumber(InputStreamDevice& device, int64_t& value);
86
 // Read the next variant from the current file position ignoring all comments.
 // `variant` receives the result. `encrypt` is optional and defaults to a
 // null pointer. NOTE(review): presumably used to decrypt strings when
 // non-null - confirm.
96 void ReadNextVariant(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt = { });
97 bool TryReadNextVariant(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt = { });
98
99protected:
100 // This enum differs from regular PdfDataType in the sense
101 // it enumerates only data types that can be determined literally
102 // by the tokenization, and it specifies whether string literals
103 // are regular or hex strings
104 enum class PdfLiteralDataType : uint8_t
105 {
106 Unknown = 0,
107 Bool,
108 Number,
109 Real,
110 String,
111 HexString,
112 Name,
113 Array,
114 Dictionary,
115 Null,
116 Reference,
117 };
118
119protected:
 // Read the next variant from the current file position ignoring all
 // comments. These overloads additionally take a `token` and its
 // `tokenType`. NOTE(review): presumably a token the caller has already
 // read, from which parsing of the variant starts - confirm.
130 void ReadNextVariant(InputStreamDevice& device, const std::string_view& token, PdfTokenType tokenType, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
131 bool TryReadNextVariant(InputStreamDevice& device, const std::string_view& token, PdfTokenType tokenType, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
132
 // NOTE(review): presumably stores the token in m_tokenQueque so that it is
 // returned by a subsequent read - confirm against the implementation.
142 void EnqueueToken(const std::string_view& token, PdfTokenType type);
143
 // Readers for the individual literal kinds; each writes its result to
 // `variant`. All take the optional `encrypt` pointer except ReadName.
150 void ReadDictionary(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
151
158 void ReadArray(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
159
166 void ReadString(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
167
174 void ReadHexString(InputStreamDevice& device, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
175
183 void ReadName(InputStreamDevice& device, PdfVariant& variant);
184
 // Classifies `token` as one of the PdfLiteralDataType values.
 // NOTE(review): `variant` is passed by non-const reference, so it may be
 // filled in directly for some types - confirm which ones.
191 PdfLiteralDataType DetermineDataType(InputStreamDevice& device, const std::string_view& token, PdfTokenType tokenType, PdfVariant& variant);
192
193private:
 // Private constructor distinguished from the public ones by the
 // std::in_place_t tag; takes the buffer by rvalue reference.
 // NOTE(review): presumably the common target of both public constructors - confirm.
194 PdfTokenizer(std::in_place_t, std::shared_ptr<charbuff>&& buffer, const PdfTokenizerOptions& options);
 // NOTE(review): presumably dispatches to the Read* method matching `dataType` - confirm.
195 bool tryReadDataType(InputStreamDevice& device, PdfLiteralDataType dataType, PdfVariant& variant, const PdfStatefulEncrypt* encrypt);
196
197private:
 // Queue entries hold the token text as an owning std::string together
 // with its token type.
198 using TokenizerPair = std::pair<std::string, PdfTokenType>;
199 using TokenizerQueque = std::deque<TokenizerPair>;
200
201private:
 // Shared buffer; charbuff is a char array storage type with std::string compatibility.
202 std::shared_ptr<charbuff> m_buffer;
 // Copy of the options given at construction.
203 PdfTokenizerOptions m_options;
 // Queue of tokens together with their types.
204 TokenizerQueque m_tokenQueque;
 // Additional buffer owned by this instance.
 // NOTE(review): its exact use is not visible in this header - confirm.
205 charbuff m_charBuffer;
206};
207
208};
209
210#endif // PDF_TOKENIZER_H
SPDX-FileCopyrightText: (C) 2005 Dominik Seichter domseichter@web.de SPDX-FileCopyrightText: (C) 2020...
This class represents an input device. It optionally supports peeking.
Definition InputDevice.h:22
This class is a parser for general PostScript content in PDF documents.
Definition PdfPostScriptTokenizer.h:30
A simple tokenizer for PDF files and PDF content streams.
Definition PdfTokenizer.h:36
bool TryReadNextToken(InputStreamDevice &device, std::string_view &token)
Reads the next token from the current file position ignoring all comments.
bool TryPeekNextToken(InputStreamDevice &device, std::string_view &token)
Try peek the next token from the current file position ignoring all comments, without actually consum...
void ReadNextVariant(InputStreamDevice &device, const std::string_view &token, PdfTokenType tokenType, PdfVariant &variant, const PdfStatefulEncrypt *encrypt)
Read the next variant from the current file position ignoring all comments.
A variant data type which supports all data types supported by the PDF standard.
Definition PdfVariant.h:33
Convenient type for char array storage and/or buffer with std::string compatibility.
Definition basetypes.h:38
SPDX-FileCopyrightText: (C) 2022 Francesco Pretto ceztko@gmail.com SPDX-License-Identifier: LGPL-2....
Definition basetypes.h:16