Commit 6bbfd2d7 authored by Tomáš Stefan's avatar Tomáš Stefan

Squashed commit of the following:

// It started as a branch for adding UTF-8 support, but ended up differently

commit 3615313dcba1e70a7fec91df1c2f022a99bfd09a
Author: Tomáš Stefan <ts@stdin.cz>
Date:   Sat Mar 10 21:06:23 2018 +0100

    read all xref sections

    add fn parse_word and replace with it other specific ones
    store all xref entries for one object (different generation number)
    add fn skip_array

commit 98ecb7ab1cf17465e295d47b1bb2f12d62c28d78
Author: Tomáš Stefan <ts@stdin.cz>
Date:   Thu Mar 8 17:11:18 2018 +0100

    CMake improvements, Build instructions

commit f85f88dbee46453b6475269274734998e85e470d
Author: Tomáš Stefan <ts@stdin.cz>
Date:   Wed Mar 7 16:56:02 2018 +0100

    Minor corrections

    - add missing semicolon on WIN
    - no colour on platforms other than unix
    - signed/unsigned comparison corrections
    - processing fread in loop

commit 2513508e6397af93d0d89783e56744105ef7546a
Author: Tomáš Stefan <ts@stdin.cz>
Date:   Wed Mar 7 14:36:15 2018 +0100

    working on Windows support

commit 05fdbc0c01a25af0c357659504a42dbb5c2b85c9
Author: Tomáš Stefan <ts@stdin.cz>
Date:   Wed Mar 7 00:24:58 2018 +0100

    UTF-8 paths, CMake support

    utf-8 path support
    CMake support
    open file on Windows - work in progress

commit 7e8f520d0b269eb566a6187ab0f2bd64a4b04925
Author: Tomáš Stefan <ts@stdin.cz>
Date:   Tue Mar 6 17:28:08 2018 +0100

    remove char_t, using UTF-8 narrow strings
parent 2f78f75d
......@@ -59,3 +59,6 @@ Module.symvers
Mkfile.old
dkms.conf
# Editor files
.idea
cmake-build-debug
# Build script for pdf-sigil: builds the library (static and shared),
# the selftest executable, and targets that run the selftest.
cmake_minimum_required(VERSION 3.9)
project(pdf_sigil)
set(CMAKE_C_STANDARD 11)
# Warning flags are compiler specific: -Wall/-pedantic are GCC-style options
# and -pedantic is not understood by MSVC, so MSVC gets its own warning level.
if (MSVC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W4")
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -pedantic")
endif()
# header files
include_directories(include)
# every C file in lib/ belongs to the library; the selftest has a single source
file(GLOB LIB_SRC "lib/*.c")
set(TEST_SRC "test/test.c")
# build both static and shared library
add_library(pdf-sigil_static STATIC ${LIB_SRC})
add_library(pdf-sigil_shared SHARED ${LIB_SRC})
# build selftest executable
add_executable(selftest ${TEST_SRC})
target_link_libraries(selftest pdf-sigil_static)
# running selftest
# "ALL" makes run_tests part of the default build, so the selftest is executed
# on every build. The working directory is the source tree because the tests
# open files through paths relative to it (e.g. "test/...").
add_custom_target(run_tests ALL
COMMAND selftest
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
add_custom_target(run_tests_verbose
COMMAND selftest --verbose
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
add_custom_target(run_tests_quiet
COMMAND selftest --quiet
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
\ No newline at end of file
......@@ -4,3 +4,21 @@ pdf-sigil
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
The word **sigil** in the name stands for the Latin word *sigillum*, which means **seal** or **stamp**.
### Build
The following steps build the project, with output going into the "build" directory. After these steps, there will be a **static and a shared library** as well as a **selftest executable**.
```shell
cmake -E make_directory build
cmake -E chdir build cmake ..
cmake -E chdir build make
```
The selftest runs automatically as part of `make`, but if you want to re-run it, use one of the following commands:
```shell
cmake -E chdir build make run_tests # producing default output level
cmake -E chdir build make run_tests_verbose # verbose output level
cmake -E chdir build make run_tests_quiet # without output
```
......@@ -11,8 +11,8 @@
// Shouldn't be optimized out by the compiler
void sigil_zeroize(void *a, size_t bytes);
int is_digit(const char_t c);
int is_whitespace(const char_t c);
int is_digit(const char c);
int is_whitespace(const char c);
sigil_err_t pdf_read(sigil_t *sgl, size_t size, char *result, size_t *res_size);
sigil_err_t pdf_get_char(sigil_t *sgl, char *result);
......@@ -22,12 +22,12 @@ sigil_err_t pdf_move_pos_rel(sigil_t *sgl, ssize_t shift_bytes);
sigil_err_t pdf_move_pos_abs(sigil_t *sgl, size_t position);
sigil_err_t skip_leading_whitespaces(sigil_t *sgl);
sigil_err_t skip_array(sigil_t *sgl);
sigil_err_t skip_dictionary(sigil_t *sgl);
sigil_err_t skip_dict_unknown_value(sigil_t *sgl);
sigil_err_t parse_number(sigil_t *sgl, size_t *number);
sigil_err_t parse_keyword(sigil_t *sgl, keyword_t *keyword);
sigil_err_t parse_free_indicator(sigil_t *sgl, free_indicator_t *result);
sigil_err_t parse_word(sigil_t *sgl, const char *word);
sigil_err_t parse_indirect_reference(sigil_t *sgl, reference_t *ref);
sigil_err_t parse_dict_key(sigil_t *sgl, dict_key_t *dict_key);
......
......@@ -14,6 +14,10 @@
// threshold in bytes for loading whole file into buffer
#define THRESHOLD_FILE_BUFFERING 10485760
// maximum number of file updates, preventing an endless loop when processing
// previous cross-reference sections (caused by cyclic links)
#define MAX_FILE_UPDATES 1024
// validate values
int sigil_config_self_test(int verbosity);
......
......@@ -2,21 +2,21 @@
#define PDF_SIGIL_CONSTANTS_H
#define COLOR_RED "\x1b[31m"
#define COLOR_GREEN "\x1b[32m"
#define COLOR_RESET "\x1b[0m"
// ANSI colour escape sequences for terminal output. They are enabled only on
// Unix-like systems; macOS is detected separately because it does not define
// __unix__. On every other platform the macros expand to empty strings, so
// printing them has no visible effect.
#if defined(__unix__) || defined(__unix) || \
    (defined(__APPLE__) && defined(__MACH__))
#define COLOR_RED "\x1b[31m"
#define COLOR_GREEN "\x1b[32m"
#define COLOR_RESET "\x1b[0m"
#else
#define COLOR_RED ""
#define COLOR_GREEN ""
#define COLOR_RESET ""
#endif
#define KEYWORD_UNSET 0
#define KEYWORD_xref 1
#define KEYWORD_trailer 2
#define XREF_TYPE_UNSET 0
#define XREF_TYPE_TABLE 1
#define XREF_TYPE_STREAM 2
#define IN_USE_ENTRY 0
#define FREE_ENTRY 1
#define DICT_KEY_UNKNOWN 0
#define DICT_KEY_Size 1
#define DICT_KEY_Prev 2
......
......@@ -2,16 +2,18 @@
#define PDF_SIGIL_TYPES_H
#include <stdint.h> // uint32_t
#include <stdio.h>
typedef unsigned char char_t;
#ifdef _WIN32
#include <BaseTsd.h>
typedef SSIZE_T ssize_t;
#endif
typedef uint32_t sigil_err_t;
typedef uint32_t keyword_t;
typedef uint32_t free_indicator_t;
typedef uint32_t dict_key_t;
typedef struct {
......@@ -19,9 +21,10 @@ typedef struct {
size_t generation_num;
} reference_t;
typedef struct {
typedef struct xref_entry_t {
size_t byte_offset;
size_t generation_num;
struct xref_entry_t *next;
} xref_entry_t;
typedef struct {
......
This diff is collapsed.
......@@ -44,5 +44,6 @@ int sigil_config_self_test(int verbosity)
failed:
print_test_result(0, verbosity);
print_module_result(0, verbosity);
return 1;
}
......@@ -15,19 +15,13 @@ sigil_err_t process_header(sigil_t *sgl)
size_t read_size;
size_t pdf_x, pdf_y;
if (sgl == NULL)
return ERR_PARAMETER;
err = pdf_move_pos_abs(sgl, 0);
if (err != ERR_NO)
return err;
offset = 0;
while(1) {
if (offset > HEADER_SEARCH_OFFSET)
return ERR_PDF_CONTENT;
for (offset = 0; offset < HEADER_SEARCH_OFFSET; offset++) {
err = pdf_move_pos_abs(sgl, offset);
if (err != ERR_NO)
return err;
err = pdf_read(sgl, 5, tmp, &read_size);
if (err != ERR_NO)
......@@ -35,35 +29,32 @@ sigil_err_t process_header(sigil_t *sgl)
if (read_size != 5)
return ERR_PDF_CONTENT;
if (strncmp(tmp, "\x25PDF-", 5) == 0) {
if ((err = parse_number(sgl, &pdf_x)) != ERR_NO)
return err;
if (strncmp(tmp, "\x25PDF-", 5) != 0)
continue;
if ((err = pdf_get_char(sgl, &c)) != ERR_NO)
return err;
if (c != '.')
return ERR_PDF_CONTENT;
if ((err = parse_number(sgl, &pdf_y)) != ERR_NO)
return err;
if ((err = parse_number(sgl, &pdf_x)) != ERR_NO)
return err;
if ((pdf_x == 1 && pdf_y >= 0 && pdf_y <= 7) ||
(pdf_x == 2 && pdf_y == 0))
{
sgl->pdf_x = pdf_x;
sgl->pdf_y = pdf_y;
} else {
return ERR_PDF_CONTENT;
}
if ((err = pdf_get_char(sgl, &c)) != ERR_NO)
return err;
if (c != '.')
return ERR_PDF_CONTENT;
sgl->pdf_start_offset = offset;
if ((err = parse_number(sgl, &pdf_y)) != ERR_NO)
return err;
return ERR_NO;
if ((pdf_x == 1 && pdf_y >= 0 && pdf_y <= 7) ||
(pdf_x == 2 && pdf_y == 0))
{
sgl->pdf_x = (short)pdf_x;
sgl->pdf_y = (short)pdf_y;
} else {
return ERR_PDF_CONTENT;
}
if ((err = pdf_move_pos_rel(sgl, -4)) != ERR_NO)
return err;
offset++;
sgl->pdf_start_offset = offset;
return ERR_NO;
}
return ERR_PDF_CONTENT;
......
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "auxiliary.h"
#include "config.h"
#include "constants.h"
......@@ -40,7 +41,8 @@ sigil_err_t sigil_init(sigil_t **sgl)
sigil_err_t sigil_set_pdf_file(sigil_t *sgl, FILE *pdf_file)
{
size_t processed = 0;
size_t processed,
total_processed;
char *content = NULL;
if (sgl == NULL || pdf_file == NULL)
......@@ -57,7 +59,7 @@ sigil_err_t sigil_set_pdf_file(sigil_t *sgl, FILE *pdf_file)
return ERR_IO;
// - 2) read current position
sgl->pdf_data.size = ftell(sgl->pdf_data.file);
sgl->pdf_data.size = (size_t)(ftell(sgl->pdf_data.file) - 1);
if (sgl->pdf_data.size < 0)
return ERR_IO;
......@@ -72,14 +74,28 @@ sigil_err_t sigil_set_pdf_file(sigil_t *sgl, FILE *pdf_file)
return ERR_NO;
}
processed = fread(content, sgl->pdf_data.size,
sizeof(char), sgl->pdf_data.file);
if (processed != sgl->pdf_data.size) {
total_processed = 0;
while (total_processed * sizeof(char) < sgl->pdf_data.size) {
processed = fread(content + total_processed, sizeof(char),
sgl->pdf_data.size, sgl->pdf_data.file);
total_processed += processed;
if (processed <= 0 ||
total_processed * sizeof(char) > sgl->pdf_data.size)
{
// fallback to using the file
free(content);
return ERR_NO;
}
}
if (total_processed * sizeof(char) != sgl->pdf_data.size) {
// fallback to using the file
free(content);
return ERR_NO;
}
content[processed] = '\0';
content[total_processed] = '\0';
sgl->pdf_data.buffer = content;
sgl->pdf_data.deallocation_info |= DEALLOCATE_BUFFER;
......@@ -95,8 +111,39 @@ sigil_err_t sigil_set_pdf_path(sigil_t *sgl, const char *path_to_pdf)
FILE *pdf_file = NULL;
if ((pdf_file = fopen(path_to_pdf, "r")) == NULL)
return ERR_IO;
#ifdef _WIN32
// convert path to wchar_t
size_t out_size;
size_t path_len;
wchar_t *path_to_pdf_win;
path_len = strlen(path_to_pdf) + 1;
path_to_pdf_win = malloc(path_len * sizeof(wchar_t));
if (path_to_pdf_win == NULL)
return ERR_ALLOCATION;
sigil_zeroize(path_to_pdf_win, path_len * sizeof(wchar_t));
if (mbstowcs_s(&out_size, // out ... characters converted
path_to_pdf_win, // out ... converted string
path_len, // in ... size of path_to_pdf_win
path_to_pdf, // in ... input string
path_len - 1 // in ... max wide chars to store
) != 0)
{
free(path_to_pdf_win);
return ERR_IO;
}
if (_wfopen_s(&pdf_file, path_to_pdf_win, L"rb") != 0) {
free(path_to_pdf_win);
return ERR_IO;
}
free(path_to_pdf_win);
#else
if ((pdf_file = fopen(path_to_pdf, "rb")) == NULL)
return ERR_IO;
#endif
sgl->pdf_data.deallocation_info |= DEALLOCATE_FILE;
return sigil_set_pdf_file(sgl, pdf_file);
......@@ -126,14 +173,37 @@ sigil_err_t sigil_verify(sigil_t *sgl)
if (err != ERR_NO)
return err;
// process cross-reference section
err = process_xref(sgl);
// determine offset to the first cross-reference section
err = read_startxref(sgl);
if (err != ERR_NO)
return err;
err = process_trailer(sgl);
if (err != ERR_NO)
return err;
if (sgl->xref != NULL)
xref_free(sgl->xref);
sgl->xref = xref_init();
if (sgl->xref == NULL)
return ERR_ALLOCATION;
sgl->xref->prev_section = sgl->startxref;
size_t max_file_updates = MAX_FILE_UPDATES;
while (sgl->xref->prev_section > 0 && (max_file_updates--) > 0) {
// go to the position of the beginning of next cross-reference section
err = pdf_move_pos_abs(sgl, sgl->xref->prev_section);
if (err != ERR_NO)
return err;
sgl->xref->prev_section = 0;
err = process_xref(sgl);
if (err != ERR_NO)
return err;
err = process_trailer(sgl);
if (err != ERR_NO)
return err;
}
// TODO
......@@ -186,15 +256,16 @@ int sigil_sigil_self_test(int verbosity)
print_test_item("fn sigil_verify", verbosity);
{
sgl = NULL;
err = sigil_init(&sgl);
if (err != ERR_NO || sgl == NULL)
sgl = test_prepare_sgl_path(
"test/uznavany_bez_razitka_bez_revinfo_27_2_2012_CMS.pdf");
if (sgl == NULL)
goto failed;
// TODO
if (1)
if (sigil_verify(sgl) != ERR_NO || 1)
goto failed;
// TODO test verification result
sigil_free(&sgl);
}
......@@ -202,6 +273,7 @@ int sigil_sigil_self_test(int verbosity)
// all tests done
print_module_result(1, verbosity);
return 0;
failed:
......@@ -210,5 +282,6 @@ failed:
print_test_result(0, verbosity);
print_module_result(0, verbosity);
return 1;
}
#include <stdio.h>
#include "auxiliary.h"
#include "constants.h"
#include "trailer.h"
......@@ -5,32 +6,27 @@
sigil_err_t process_trailer(sigil_t *sgl)
{
sigil_err_t err;
keyword_t keyword;
dict_key_t dict_key;
char c;
if (sgl == NULL)
return ERR_PARAMETER;
// read "trailer"
err = parse_keyword(sgl, &keyword);
err = parse_word(sgl, "trailer");
if (err != ERR_NO)
return err;
if (keyword != KEYWORD_trailer)
return ERR_PDF_CONTENT;
err = skip_leading_whitespaces(sgl);
err = parse_word(sgl, "<<");
if (err != ERR_NO)
return err;
if ((pdf_get_char(sgl, &c)) != ERR_NO || c != '<')
return ERR_PDF_CONTENT;
if ((pdf_get_char(sgl, &c)) != ERR_NO || c != '<')
return ERR_PDF_CONTENT;
while ((err = parse_dict_key(sgl, &dict_key)) == ERR_NO) {
switch (dict_key) {
case DICT_KEY_Size:
err = parse_number(sgl, &(sgl->xref->size_from_trailer));
if (sgl->xref->size_from_trailer > 0) {
err = skip_dict_unknown_value(sgl);
} else {
err = parse_number(sgl, &(sgl->xref->size_from_trailer));
}
if (err != ERR_NO)
return err;
break;
......@@ -40,7 +36,13 @@ sigil_err_t process_trailer(sigil_t *sgl)
return err;
break;
case DICT_KEY_Root:
err = parse_indirect_reference(sgl, &(sgl->ref_catalog_dict));
if (sgl->ref_catalog_dict.object_num > 0 ||
sgl->ref_catalog_dict.generation_num > 0)
{
err = skip_dict_unknown_value(sgl);
} else {
err = parse_indirect_reference(sgl, &(sgl->ref_catalog_dict));
}
if (err != ERR_NO)
return err;
break;
......@@ -54,6 +56,9 @@ sigil_err_t process_trailer(sigil_t *sgl)
}
}
if (err == ERR_END_OF_DICT)
return ERR_NO;
return err;
}
......@@ -73,10 +78,12 @@ int sigil_trailer_self_test(int verbosity)
// all tests done
print_module_result(1, verbosity);
return 0;
failed:
print_test_result(0, verbosity);
print_module_result(0, verbosity);
return 1;
}
......@@ -12,10 +12,6 @@ static sigil_err_t determine_xref_type(sigil_t *sgl)
sigil_err_t err;
char c;
err = pdf_move_pos_abs(sgl, sgl->startxref);
if (err != ERR_NO)
return err;
if ((err = pdf_peek_char(sgl, &c)) != ERR_NO)
return err;
......@@ -51,30 +47,32 @@ add_xref_entry(xref_t *xref, size_t obj, size_t offset, size_t generation)
xref->capacity *= resize_factor;
}
if (xref->entry[obj] != NULL) {
if (xref->entry[obj]->generation_num < generation) {
xref->entry[obj]->byte_offset = offset;
xref->entry[obj]->generation_num = generation;
} else {
xref_entry_t **xref_entry = &(xref->entry[obj]);
while (*xref_entry != NULL) {
if ((*xref_entry)->generation_num == generation)
return ERR_NO;
}
} else {
xref->entry[obj] = malloc(sizeof(xref_entry_t));
if (xref->entry[obj] == NULL)
return ERR_ALLOCATION;
sigil_zeroize(xref->entry[obj], sizeof(xref->entry[obj]));
xref->entry[obj]->byte_offset = offset;
xref->entry[obj]->generation_num = generation;
xref_entry = &(*xref_entry)->next;
}
return ERR_ALLOCATION;
*xref_entry = malloc(sizeof(xref_entry_t));
if (*xref_entry == NULL)
return ERR_ALLOCATION;
sigil_zeroize(*xref_entry, sizeof(xref_entry_t));
(*xref_entry)->byte_offset = offset;
(*xref_entry)->generation_num = generation;
return ERR_NO;
}
static void free_xref_entry(xref_entry_t *entry)
{
if (entry != NULL)
if (entry != NULL) {
free_xref_entry(entry->next);
free(entry);
}
}
xref_t *xref_init()
......@@ -82,14 +80,14 @@ xref_t *xref_init()
xref_t *xref = malloc(sizeof(xref_t));
if (xref == NULL)
return NULL;
sigil_zeroize(xref, sizeof(xref));
sigil_zeroize(xref, sizeof(*xref));
xref->entry = malloc(sizeof(xref_entry_t *) * XREF_PREALLOCATION);
if (xref->entry == NULL) {
free(xref);
return NULL;
}
sigil_zeroize(xref->entry, sizeof(*(xref->entry)) * xref->capacity);
sigil_zeroize(xref->entry, sizeof(*(xref->entry)) * XREF_PREALLOCATION);
xref->capacity = XREF_PREALLOCATION;
xref->size_from_trailer = 0;
xref->prev_section = 0;
......@@ -103,7 +101,7 @@ void xref_free(xref_t *xref)
return;
if (xref->entry != NULL) {
for (int i = 0; i < xref->capacity; i++) {
for (size_t i = 0; i < xref->capacity; i++) {
free_xref_entry(xref->entry[i]);
}
free(xref->entry);
......@@ -159,29 +157,21 @@ sigil_err_t read_startxref(sigil_t *sgl)
sigil_err_t read_xref_table(sigil_t *sgl)
{
free_indicator_t free_indicator;
size_t section_start = 0,
section_cnt = 0,
obj_offset,
obj_generation;
int xref_end = 0;
keyword_t keyword;
sigil_err_t err;
if (sgl->xref == NULL)
if (sgl->xref == NULL) {
sgl->xref = xref_init();
if (sgl->xref == NULL)
return ERR_ALLOCATION;
err = pdf_move_pos_abs(sgl, sgl->startxref);
if (err != ERR_NO)
return err;
if (sgl->xref == NULL)
return ERR_ALLOCATION;
}
// read "xref"
if ((err = parse_keyword(sgl, &keyword)) != ERR_NO)
if ((err = parse_word(sgl, "xref")) != ERR_NO)
return err;
if (keyword != KEYWORD_xref)
return ERR_PDF_CONTENT;
while (!xref_end) { // for all xref sections
while (1) {
......@@ -198,21 +188,26 @@ sigil_err_t read_xref_table(sigil_t *sgl)
return 1;
// for all entries in one section
for (int section_offset = 0; section_offset < section_cnt; section_offset++) {
for (size_t section_offset = 0; section_offset < section_cnt; section_offset++) {
err = parse_number(sgl, &obj_offset);
if (err != ERR_NO)
return err;
err = parse_number(sgl, &obj_generation);
if (err != ERR_NO)
return err;
err = parse_free_indicator(sgl, &free_indicator);
if (parse_word(sgl, "f") == ERR_NO)
continue;
err = parse_word(sgl, "n");
if (err != ERR_NO)
return err;
size_t obj_num = section_start + section_offset;
if (free_indicator == IN_USE_ENTRY) {
err = add_xref_entry(sgl->xref, obj_num, obj_offset, obj_generation); if (err != ERR_NO)
return err;
}