pdf2htmlEX/src/HTMLRenderer/text.cc at 9dbd5044a4515332ffbdb8211bd85bb36ac491ef · coolwanglu/pdf2htmlEX · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
/*
 * text.cc
 *
 * Handling text & font, and related stuff
 *
 * Copyright (C) 2012 Lu Wang <coolwanglu@gmail.com>
 */


#include <algorithm>

#include "HTMLRenderer.h"

#include "util/namespace.h"
#include "util/unicode.h"

// Debug tracing switch for this file.
// With the active definition HR_DEBUG(x) expands to nothing, so its argument
// is never compiled or evaluated. To turn tracing on, enable the commented-out
// definition (which expands to the argument itself) in place of the empty one.
//#define HR_DEBUG(x)  (x)
#define HR_DEBUG(x)

namespace pdf2htmlEX {

using std::none_of;
using std::cerr;
using std::endl;

/*
 * Append the characters of one PDF string to the current line of html_text_page.
 *
 * state: graphics state; the font, character spacing, word spacing and
 *        horizontal scaling are read from it.
 * s:     the string to show; it is decoded one character at a time through
 *        the font's getNextChar().
 *
 * Returns without emitting anything when the string is empty, when no font is
 * set, when the font reports a writing mode, or when the font is Type 3 and
 * param.process_type3 is off.
 *
 * After the loop, cur_tx/cur_ty and draw_tx/draw_ty are advanced by the
 * displacement accumulated over the whole string.
 */
void HTMLRenderer::drawString(GfxState * state, GooString * s)
{
    if(s->getLength() == 0)
        return;

    auto font = state->getFont();
    const double letter_spacing = state->getCharSpace();
    const double word_spacing   = state->getWordSpace();
    const double horiz_scaling  = state->getHorizScaling();

    // Fonts with a writing mode and (unless enabled) Type 3 fonts are not emitted
    // as HTML text here; per the original author they are rendered as images:
    //  - writing mode fonts would need one div per character, which is too costly
    //  - Type 3 fonts are hard to reproduce in HTML because of the font matrix
    if(font == nullptr)
        return;
    if(font->getWMode())
        return;
    if((font->getType() == fontType3) && (!param.process_type3))
        return;

    // A state change may force the current line to be closed first
    check_state_change(state);
    prepare_text_line(state);

    // Raw bytes still to be decoded
    char *cursor = s->getCString();
    int remaining = s->getLength();

    // Displacement accumulated over the characters of this string, in text object space
    double total_dx = 0;
    double total_dy = 0;

    // Outputs of getNextChar for the current character:
    // advance (glyph space), origin (glyph space), character code and mapped unicodes
    double adv_x, adv_y;
    double org_x, org_y;
    int ucount;
    CharCode char_code;
    Unicode *unicodes = nullptr;

    HR_DEBUG(printf("HTMLRenderer::drawString:len=%d\n", remaining));

    while (remaining > 0)
    {
        auto consumed = font->getNextChar(cursor, remaining, &char_code, &unicodes, &ucount, &adv_x, &adv_y, &org_x, &org_y);
        HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%lc(%d)%s\n", (wchar_t)unicodes[0], unicodes[0], has_glyph(char_code, font) ? "":" no glyph"));

        if(!equal(org_x, 0) || !equal(org_y, 0))
        {
            cerr << "TODO: non-zero origins" << endl;
        }

        // Displacement of this character in text object space:
        // includes letter spacing, excludes word spacing
        const double step_dx = adv_x * cur_font_size + letter_spacing;
        const double step_dy = adv_y * cur_font_size;

        // The tracer is given the displacement accumulated *before* this character
        tracer.draw_char(state, total_dx, total_dy, adv_x, adv_y);

        /*
         * A single-byte ' ' is a space according to the standard.
         * Some PDFs nevertheless use ' ' as an ordinary encoding slot mapped to
         * other unicodes; with space_as_offset on, such a character is simply dropped.
         *
         * Looking at the mapped unicode instead may or may not help:
         * some PDF files carry no useful information at all.
         */
        const bool is_space = (consumed == 1) && (*cursor == ' ');

        if(is_space && (param.space_as_offset))
        {
            html_text_page.get_cur_line()->append_padding_char();
            // horiz_scaling is left out on purpose: it has been merged into the CTM
            html_text_page.get_cur_line()->append_offset((adv_x * cur_font_size + letter_spacing + word_spacing) * draw_text_scale);
        }
        else if((param.decompose_ligature) && (ucount > 1) && none_of(unicodes, unicodes+ucount, is_illegal_unicode))
        {
            // Ligature whose components are all legal: emit every component
            html_text_page.get_cur_line()->append_unicodes(unicodes, ucount, step_dx);
        }
        else if (ucount == 1 && is_illegal_unicode(unicodes[0]) && !has_glyph(char_code, font))
        {
            // An illegal HTML unicode without a glyph becomes whitespace.
            // The zero-width space goes AFTER the offset so that words stay
            // delimited, and so that it can be optimized out when the offset
            // itself is represented by a space (see HTMLTextLine::dump_unicode).
            html_text_page.get_cur_line()->append_offset(step_dx * draw_text_scale);
            html_text_page.get_cur_line()->append_unicodes(&zero_width_space, 1, 0);
        }
        else
        {
            Unicode mapped = cur_text_state.font_info->use_tounicode
                ? check_unicode(unicodes, ucount, char_code, font)
                : unicode_from_font(char_code, font);
            html_text_page.get_cur_line()->append_unicodes(&mapped, 1, step_dx);

            /*
             * PDF adds word_space when (n == 1 and *p == ' '),
             * whereas HTML adds it when the emitted unicode is ' '.
             * Compensate for the difference between the two rules.
             */
            const int space_count = static_cast<int>(is_space) - static_cast<int>(mapped == ' ');
            if(space_count != 0)
            {
                html_text_page.get_cur_line()->append_offset(word_spacing * draw_text_scale * space_count);
            }
        }

        // Accumulate the displacement of this character
        total_dx += step_dx * horiz_scaling;
        total_dy += step_dy;
        if (is_space)
            total_dx += word_spacing * horiz_scaling;

        // Step over the bytes just decoded
        cursor += consumed;
        remaining -= consumed;
    }

    cur_tx += total_dx;
    draw_tx += total_dx;

    cur_ty += total_dy;
    draw_ty += total_dy;
}

/*
 * Look up the "covered" flag of one character.
 *
 * index: position into the per-character flags obtained from
 *        covered_text_detector.get_chars_covered().
 *
 * Returns the flag stored at `index`. If `index` is negative or not smaller
 * than the number of flags, a warning is written to std::cerr and false is
 * returned instead.
 */
bool HTMLRenderer::is_char_covered(int index)
{
    // Bind by const reference rather than plain `auto`: plain `auto` deduces a
    // value type, so it would copy the whole flag container on every lookup if
    // get_chars_covered() hands back a reference (NOTE(review): confirm its
    // return type). If it returns by value instead, the const reference simply
    // keeps the temporary alive until the end of this function.
    const auto & covered = covered_text_detector.get_chars_covered();
    if (index < 0 || index >= static_cast<int>(covered.size()))
    {
        std::cerr << "Warning: HTMLRenderer::is_char_covered: index out of bound: "
                << index << ", size: " << covered.size() <<endl;
        return false;
    }
    return covered[index];
}

} // namespace pdf2htmlEX