691 lines
13 KiB
Go
691 lines
13 KiB
Go
package main
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"math"
|
|
"os"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/pdfcpu/pdfcpu/pkg/api"
|
|
"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
|
|
"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/types"
|
|
"rsc.io/pdf"
|
|
)
|
|
|
|
// Tuning constants for the crop computation, expressed in PDF points.
const (
	// paddingPoints is the margin added on every side of the detected
	// content box before it is clamped to the page box.
	paddingPoints float64 = 6.0

	// minSidePoints is the smallest width or height a crop may have; pages
	// whose computed crop is narrower or shorter than this are left as they are.
	minSidePoints float64 = 1.0
)
|
|
|
|
// rect is an axis-aligned rectangle described by its lower-left corner
// (llx, lly) and its upper-right corner (urx, ury).
type rect struct {
	llx, lly float64 // lower-left corner
	urx, ury float64 // upper-right corner
}
|
|
|
|
type pageCrop struct {
|
|
crop rect
|
|
apply bool
|
|
}
|
|
|
|
// matrix is a 3x3 transformation matrix stored row by row. Points are
// treated as row vectors, so the translation lives in the third row.
type matrix [3][3]float64
|
|
|
|
// identity is the transformation that leaves every point unchanged.
var identity = matrix{
	{1, 0, 0},
	{0, 1, 0},
	{0, 0, 1},
}
|
|
|
|
func main() {
|
|
if len(os.Args) < 2 {
|
|
fmt.Fprintf(os.Stderr, "Pouzitie: %s subor.pdf\n", filepath.Base(os.Args[0]))
|
|
os.Exit(1)
|
|
}
|
|
|
|
input := os.Args[1]
|
|
if strings.ToLower(filepath.Ext(input)) != ".pdf" {
|
|
fmt.Fprintln(os.Stderr, "Chyba: prvy argument musi byt PDF subor.")
|
|
os.Exit(1)
|
|
}
|
|
|
|
if _, err := os.Stat(input); err != nil {
|
|
fmt.Fprintf(os.Stderr, "Chyba: subor %q neexistuje alebo sa neda citat (%v).\n", input, err)
|
|
os.Exit(1)
|
|
}
|
|
|
|
output := trimmedPath(input)
|
|
if err := trimPDF(input, output); err != nil {
|
|
fmt.Fprintf(os.Stderr, "Chyba: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
|
|
fmt.Printf("Hotovo: %s\n", output)
|
|
}
|
|
|
|
func trimPDF(input, output string) error {
|
|
reader, err := pdf.Open(input)
|
|
if err != nil {
|
|
return fmt.Errorf("nepodarilo sa otvorit PDF: %w", err)
|
|
}
|
|
|
|
pageCount := reader.NumPage()
|
|
if pageCount == 0 {
|
|
return errors.New("PDF nema ziadne strany")
|
|
}
|
|
|
|
plan := make([]pageCrop, pageCount)
|
|
for i := 1; i <= pageCount; i++ {
|
|
p := reader.Page(i)
|
|
if p.V.IsNull() {
|
|
continue
|
|
}
|
|
|
|
pageBox, err := effectivePageBox(p.V)
|
|
if err != nil {
|
|
return fmt.Errorf("strana %d: %w", i, err)
|
|
}
|
|
|
|
contentBox, found := detectContentBox(p)
|
|
if !found {
|
|
continue
|
|
}
|
|
|
|
contentBox = contentBox.expand(paddingPoints).clamp(pageBox)
|
|
if contentBox.width() < minSidePoints || contentBox.height() < minSidePoints {
|
|
continue
|
|
}
|
|
|
|
if nearlyEqualRect(contentBox, pageBox, 0.01) {
|
|
continue
|
|
}
|
|
|
|
plan[i-1] = pageCrop{crop: contentBox, apply: true}
|
|
}
|
|
|
|
if err := copyFile(input, output); err != nil {
|
|
return fmt.Errorf("nepodarilo sa pripravit vystupny subor: %w", err)
|
|
}
|
|
|
|
for i, item := range plan {
|
|
if !item.apply {
|
|
continue
|
|
}
|
|
|
|
boxDef := item.crop.boxDef()
|
|
cropBox, err := model.ParseBox(boxDef, types.POINTS)
|
|
if err != nil {
|
|
return fmt.Errorf("strana %d: neplatny crop box %s: %w", i+1, boxDef, err)
|
|
}
|
|
|
|
pageSel := []string{strconv.Itoa(i + 1)}
|
|
if err := api.CropFile(output, output, pageSel, cropBox, nil); err != nil {
|
|
return fmt.Errorf("strana %d: crop zlyhal: %w", i+1, err)
|
|
}
|
|
|
|
pageBounds, err := api.PageBoundaries("media:"+boxDef, types.POINTS)
|
|
if err != nil {
|
|
return fmt.Errorf("strana %d: nepodarilo sa pripravit media box: %w", i+1, err)
|
|
}
|
|
if err := api.AddBoxesFile(output, output, pageSel, pageBounds, nil); err != nil {
|
|
return fmt.Errorf("strana %d: nastavenie media box zlyhalo: %w", i+1, err)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func detectContentBox(p pdf.Page) (rect, bool) {
|
|
var box rect
|
|
found := false
|
|
|
|
add := func(r rect) {
|
|
r = r.normalize()
|
|
if !r.valid() {
|
|
return
|
|
}
|
|
if !found {
|
|
box = r
|
|
found = true
|
|
return
|
|
}
|
|
box = box.union(r)
|
|
}
|
|
|
|
if tb, ok := detectTextBox(p); ok {
|
|
add(tb)
|
|
}
|
|
|
|
if gb, ok := detectGraphicsBox(p); ok {
|
|
add(gb)
|
|
}
|
|
|
|
return box, found
|
|
}
|
|
|
|
type textState struct {
|
|
Tc float64
|
|
Tw float64
|
|
Th float64
|
|
Tl float64
|
|
Tf pdf.Font
|
|
Tfs float64
|
|
Trise float64
|
|
Tm matrix
|
|
Tlm matrix
|
|
CTM matrix
|
|
}
|
|
|
|
func detectTextBox(p pdf.Page) (rect, bool) {
|
|
var box rect
|
|
found := false
|
|
|
|
add := func(r rect) {
|
|
r = r.normalize()
|
|
if !r.valid() {
|
|
return
|
|
}
|
|
if !found {
|
|
box = r
|
|
found = true
|
|
return
|
|
}
|
|
box = box.union(r)
|
|
}
|
|
|
|
g := textState{
|
|
Th: 1,
|
|
CTM: identity,
|
|
Tm: identity,
|
|
Tlm: identity,
|
|
}
|
|
|
|
var enc pdf.TextEncoding
|
|
var gstack []textState
|
|
|
|
showText := func(raw string) {
|
|
decoded := raw
|
|
if enc != nil {
|
|
decoded = enc.Decode(raw)
|
|
}
|
|
|
|
rawBytes := []byte(raw)
|
|
i := 0
|
|
|
|
for _, ch := range decoded {
|
|
trm := matrix{{g.Tfs * g.Th, 0, 0}, {0, g.Tfs, 0}, {0, g.Trise, 1}}.mul(g.Tm).mul(g.CTM)
|
|
|
|
w0 := 500.0
|
|
if i < len(rawBytes) && !g.Tf.V.IsNull() {
|
|
w0 = g.Tf.Width(int(rawBytes[i]))
|
|
}
|
|
if i < len(rawBytes) {
|
|
i++
|
|
}
|
|
|
|
charWidth := math.Abs(w0 / 1000 * trm[0][0])
|
|
h := math.Abs(g.Tfs)
|
|
if h <= 0 {
|
|
h = 8
|
|
}
|
|
|
|
if !unicodeIsSpace(ch) {
|
|
add(rect{
|
|
llx: trm[2][0],
|
|
lly: trm[2][1] - 0.30*h,
|
|
urx: trm[2][0] + math.Max(charWidth, 0.2*h),
|
|
ury: trm[2][1] + 0.90*h,
|
|
})
|
|
}
|
|
|
|
tx := w0/1000*g.Tfs + g.Tc
|
|
if ch == ' ' {
|
|
tx += g.Tw
|
|
}
|
|
tx *= g.Th
|
|
|
|
g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm)
|
|
}
|
|
}
|
|
|
|
if !interpretPageContent(p, func(stk *pdf.Stack, op string) {
|
|
args := popArgs(stk)
|
|
|
|
switch op {
|
|
case "q":
|
|
gstack = append(gstack, g)
|
|
case "Q":
|
|
if len(gstack) == 0 {
|
|
return
|
|
}
|
|
g = gstack[len(gstack)-1]
|
|
gstack = gstack[:len(gstack)-1]
|
|
case "cm":
|
|
m, ok := matrixFromArgs(args)
|
|
if ok {
|
|
g.CTM = m.mul(g.CTM)
|
|
}
|
|
case "BT":
|
|
g.Tm = identity
|
|
g.Tlm = identity
|
|
case "T*":
|
|
x := matrix{{1, 0, 0}, {0, 1, 0}, {0, -g.Tl, 1}}
|
|
g.Tlm = x.mul(g.Tlm)
|
|
g.Tm = g.Tlm
|
|
case "Tc":
|
|
if len(args) == 1 {
|
|
g.Tc = args[0].Float64()
|
|
}
|
|
case "TD":
|
|
if len(args) != 2 {
|
|
return
|
|
}
|
|
g.Tl = -args[1].Float64()
|
|
fallthrough
|
|
case "Td":
|
|
if len(args) != 2 {
|
|
return
|
|
}
|
|
tx := args[0].Float64()
|
|
ty := args[1].Float64()
|
|
x := matrix{{1, 0, 0}, {0, 1, 0}, {tx, ty, 1}}
|
|
g.Tlm = x.mul(g.Tlm)
|
|
g.Tm = g.Tlm
|
|
case "Tf":
|
|
if len(args) != 2 {
|
|
return
|
|
}
|
|
g.Tf = p.Font(args[0].Name())
|
|
g.Tfs = args[1].Float64()
|
|
enc = nil
|
|
if !g.Tf.V.IsNull() {
|
|
enc = g.Tf.Encoder()
|
|
}
|
|
case "\"":
|
|
if len(args) != 3 {
|
|
return
|
|
}
|
|
g.Tw = args[0].Float64()
|
|
g.Tc = args[1].Float64()
|
|
showText(args[2].RawString())
|
|
case "'":
|
|
if len(args) != 1 {
|
|
return
|
|
}
|
|
x := matrix{{1, 0, 0}, {0, 1, 0}, {0, -g.Tl, 1}}
|
|
g.Tlm = x.mul(g.Tlm)
|
|
g.Tm = g.Tlm
|
|
showText(args[0].RawString())
|
|
case "Tj":
|
|
if len(args) == 1 {
|
|
showText(args[0].RawString())
|
|
}
|
|
case "TJ":
|
|
if len(args) != 1 {
|
|
return
|
|
}
|
|
v := args[0]
|
|
for i := 0; i < v.Len(); i++ {
|
|
x := v.Index(i)
|
|
if x.Kind() == pdf.String {
|
|
showText(x.RawString())
|
|
continue
|
|
}
|
|
tx := -x.Float64() / 1000 * g.Tfs * g.Th
|
|
g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm)
|
|
}
|
|
case "TL":
|
|
if len(args) == 1 {
|
|
g.Tl = args[0].Float64()
|
|
}
|
|
case "Tm":
|
|
m, ok := matrixFromArgs(args)
|
|
if ok {
|
|
g.Tm = m
|
|
g.Tlm = m
|
|
}
|
|
case "Ts":
|
|
if len(args) == 1 {
|
|
g.Trise = args[0].Float64()
|
|
}
|
|
case "Tw":
|
|
if len(args) == 1 {
|
|
g.Tw = args[0].Float64()
|
|
}
|
|
case "Tz":
|
|
if len(args) == 1 {
|
|
g.Th = args[0].Float64() / 100
|
|
}
|
|
}
|
|
}) {
|
|
return rect{}, false
|
|
}
|
|
|
|
return box, found
|
|
}
|
|
|
|
func detectGraphicsBox(p pdf.Page) (rect, bool) {
|
|
var box rect
|
|
found := false
|
|
|
|
add := func(r rect) {
|
|
r = r.normalize()
|
|
if !r.valid() {
|
|
return
|
|
}
|
|
if !found {
|
|
box = r
|
|
found = true
|
|
return
|
|
}
|
|
box = box.union(r)
|
|
}
|
|
|
|
xObjects := p.Resources().Key("XObject")
|
|
|
|
ctm := identity
|
|
var ctmStack []matrix
|
|
|
|
if !interpretPageContent(p, func(stk *pdf.Stack, op string) {
|
|
args := popArgs(stk)
|
|
|
|
switch op {
|
|
case "q":
|
|
ctmStack = append(ctmStack, ctm)
|
|
|
|
case "Q":
|
|
if len(ctmStack) == 0 {
|
|
return
|
|
}
|
|
ctm = ctmStack[len(ctmStack)-1]
|
|
ctmStack = ctmStack[:len(ctmStack)-1]
|
|
|
|
case "cm":
|
|
m, ok := matrixFromArgs(args)
|
|
if ok {
|
|
ctm = m.mul(ctm)
|
|
}
|
|
|
|
case "re":
|
|
if len(args) != 4 {
|
|
return
|
|
}
|
|
x := args[0].Float64()
|
|
y := args[1].Float64()
|
|
w := args[2].Float64()
|
|
h := args[3].Float64()
|
|
add(transformedRect(ctm, x, y, x+w, y+h))
|
|
|
|
case "Do":
|
|
if len(args) != 1 {
|
|
return
|
|
}
|
|
|
|
name := args[0].Name()
|
|
if name == "" || xObjects.IsNull() {
|
|
return
|
|
}
|
|
|
|
xobj := xObjects.Key(name)
|
|
subtype := xobj.Key("Subtype").Name()
|
|
if subtype == "Image" || subtype == "Form" {
|
|
add(transformedRect(ctm, 0, 0, 1, 1))
|
|
}
|
|
}
|
|
}) {
|
|
return rect{}, false
|
|
}
|
|
|
|
return box, found
|
|
}
|
|
|
|
func interpretPageContent(p pdf.Page, fn func(stk *pdf.Stack, op string)) bool {
|
|
contents := p.V.Key("Contents")
|
|
if contents.IsNull() {
|
|
return false
|
|
}
|
|
|
|
ok := false
|
|
run := func(stream pdf.Value) {
|
|
if safeInterpretStream(stream, fn) {
|
|
ok = true
|
|
}
|
|
}
|
|
|
|
switch contents.Kind() {
|
|
case pdf.Stream:
|
|
run(contents)
|
|
case pdf.Array:
|
|
for i := 0; i < contents.Len(); i++ {
|
|
stream := contents.Index(i)
|
|
if stream.IsNull() || stream.Kind() != pdf.Stream {
|
|
continue
|
|
}
|
|
run(stream)
|
|
}
|
|
}
|
|
|
|
return ok
|
|
}
|
|
|
|
func safeInterpretStream(stream pdf.Value, fn func(stk *pdf.Stack, op string)) (ok bool) {
|
|
if stream.IsNull() || stream.Kind() != pdf.Stream {
|
|
return false
|
|
}
|
|
|
|
defer func() {
|
|
if recover() != nil {
|
|
ok = false
|
|
}
|
|
}()
|
|
|
|
pdf.Interpret(stream, fn)
|
|
return true
|
|
}
|
|
|
|
// unicodeIsSpace reports whether r is one of the four ASCII whitespace
// characters space, tab, line feed or carriage return. Despite its name it
// does not recognise any other Unicode space.
func unicodeIsSpace(r rune) bool {
	switch r {
	case ' ', '\t', '\n', '\r':
		return true
	default:
		return false
	}
}
|
|
|
|
func popArgs(stk *pdf.Stack) []pdf.Value {
|
|
n := stk.Len()
|
|
if n == 0 {
|
|
return nil
|
|
}
|
|
|
|
args := make([]pdf.Value, n)
|
|
for i := n - 1; i >= 0; i-- {
|
|
args[i] = stk.Pop()
|
|
}
|
|
|
|
return args
|
|
}
|
|
|
|
func matrixFromArgs(args []pdf.Value) (matrix, bool) {
|
|
if len(args) != 6 {
|
|
return matrix{}, false
|
|
}
|
|
|
|
var m matrix
|
|
for i := 0; i < 6; i++ {
|
|
m[i/2][i%2] = args[i].Float64()
|
|
}
|
|
m[2][2] = 1
|
|
|
|
return m, true
|
|
}
|
|
|
|
func transformedRect(m matrix, x0, y0, x1, y1 float64) rect {
|
|
p1x, p1y := transformPoint(m, x0, y0)
|
|
p2x, p2y := transformPoint(m, x1, y0)
|
|
p3x, p3y := transformPoint(m, x0, y1)
|
|
p4x, p4y := transformPoint(m, x1, y1)
|
|
|
|
minX := math.Min(math.Min(p1x, p2x), math.Min(p3x, p4x))
|
|
minY := math.Min(math.Min(p1y, p2y), math.Min(p3y, p4y))
|
|
maxX := math.Max(math.Max(p1x, p2x), math.Max(p3x, p4x))
|
|
maxY := math.Max(math.Max(p1y, p2y), math.Max(p3y, p4y))
|
|
|
|
return rect{llx: minX, lly: minY, urx: maxX, ury: maxY}
|
|
}
|
|
|
|
func transformPoint(m matrix, x, y float64) (float64, float64) {
|
|
px := x*m[0][0] + y*m[1][0] + m[2][0]
|
|
py := x*m[0][1] + y*m[1][1] + m[2][1]
|
|
return px, py
|
|
}
|
|
|
|
func effectivePageBox(page pdf.Value) (rect, error) {
|
|
if r, ok := inheritedRect(page, "CropBox"); ok {
|
|
return r, nil
|
|
}
|
|
if r, ok := inheritedRect(page, "MediaBox"); ok {
|
|
return r, nil
|
|
}
|
|
return rect{}, errors.New("chybajuci MediaBox/CropBox")
|
|
}
|
|
|
|
func inheritedRect(v pdf.Value, key string) (rect, bool) {
|
|
for cur := v; !cur.IsNull(); cur = cur.Key("Parent") {
|
|
candidate := cur.Key(key)
|
|
r, err := rectFromArray(candidate)
|
|
if err == nil {
|
|
return r, true
|
|
}
|
|
}
|
|
return rect{}, false
|
|
}
|
|
|
|
func rectFromArray(v pdf.Value) (rect, error) {
|
|
if v.IsNull() || v.Len() != 4 {
|
|
return rect{}, errors.New("neplatny rectangle")
|
|
}
|
|
|
|
r := rect{
|
|
llx: v.Index(0).Float64(),
|
|
lly: v.Index(1).Float64(),
|
|
urx: v.Index(2).Float64(),
|
|
ury: v.Index(3).Float64(),
|
|
}.normalize()
|
|
|
|
if !r.valid() {
|
|
return rect{}, errors.New("neplatny rectangle")
|
|
}
|
|
|
|
return r, nil
|
|
}
|
|
|
|
// copyFile copies the contents of src into dst, creating dst or truncating
// it if it already exists. When the copy or the final close fails, dst is
// removed again (best effort) and the first error encountered is returned.
func copyFile(src, dst string) error {
	source, err := os.Open(src)
	if err != nil {
		return err
	}
	defer source.Close()

	target, err := os.Create(dst)
	if err != nil {
		return err
	}

	_, copyErr := target.ReadFrom(source)
	closeErr := target.Close()

	// A copy error wins over a close error.
	result := copyErr
	if result == nil {
		result = closeErr
	}

	if result != nil {
		_ = os.Remove(dst)
	}
	return result
}
|
|
|
|
// trimmedPath derives the output path from input by replacing its file
// extension (if any) with "-trimmed.pdf".
func trimmedPath(input string) string {
	const suffix = "-trimmed.pdf"
	return strings.TrimSuffix(input, filepath.Ext(input)) + suffix
}
|
|
|
|
func nearlyEqualRect(a, b rect, eps float64) bool {
|
|
return math.Abs(a.llx-b.llx) <= eps &&
|
|
math.Abs(a.lly-b.lly) <= eps &&
|
|
math.Abs(a.urx-b.urx) <= eps &&
|
|
math.Abs(a.ury-b.ury) <= eps
|
|
}
|
|
|
|
func (m matrix) mul(other matrix) matrix {
|
|
var out matrix
|
|
for i := 0; i < 3; i++ {
|
|
for j := 0; j < 3; j++ {
|
|
for k := 0; k < 3; k++ {
|
|
out[i][j] += m[i][k] * other[k][j]
|
|
}
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func (r rect) normalize() rect {
|
|
if r.llx > r.urx {
|
|
r.llx, r.urx = r.urx, r.llx
|
|
}
|
|
if r.lly > r.ury {
|
|
r.lly, r.ury = r.ury, r.lly
|
|
}
|
|
return r
|
|
}
|
|
|
|
func (r rect) valid() bool {
|
|
return !(math.IsNaN(r.llx) || math.IsNaN(r.lly) || math.IsNaN(r.urx) || math.IsNaN(r.ury) ||
|
|
math.IsInf(r.llx, 0) || math.IsInf(r.lly, 0) || math.IsInf(r.urx, 0) || math.IsInf(r.ury, 0))
|
|
}
|
|
|
|
func (r rect) width() float64 {
|
|
return r.urx - r.llx
|
|
}
|
|
|
|
func (r rect) height() float64 {
|
|
return r.ury - r.lly
|
|
}
|
|
|
|
func (r rect) union(other rect) rect {
|
|
return rect{
|
|
llx: math.Min(r.llx, other.llx),
|
|
lly: math.Min(r.lly, other.lly),
|
|
urx: math.Max(r.urx, other.urx),
|
|
ury: math.Max(r.ury, other.ury),
|
|
}
|
|
}
|
|
|
|
func (r rect) expand(padding float64) rect {
|
|
return rect{
|
|
llx: r.llx - padding,
|
|
lly: r.lly - padding,
|
|
urx: r.urx + padding,
|
|
ury: r.ury + padding,
|
|
}
|
|
}
|
|
|
|
func (r rect) clamp(limit rect) rect {
|
|
return rect{
|
|
llx: clamp(r.llx, limit.llx, limit.urx),
|
|
lly: clamp(r.lly, limit.lly, limit.ury),
|
|
urx: clamp(r.urx, limit.llx, limit.urx),
|
|
ury: clamp(r.ury, limit.lly, limit.ury),
|
|
}
|
|
}
|
|
|
|
func (r rect) boxDef() string {
|
|
return fmt.Sprintf("[%.4f %.4f %.4f %.4f]", r.llx, r.lly, r.urx, r.ury)
|
|
}
|
|
|
|
// clamp limits v to the range [minV, maxV]: values below minV become minV,
// values above maxV become maxV, everything else (including NaN) is
// returned unchanged. The lower bound is checked first.
func clamp(v, minV, maxV float64) float64 {
	switch {
	case v < minV:
		return minV
	case v > maxV:
		return maxV
	default:
		return v
	}
}
|