Files
PDFtrim/main.go

691 lines
13 KiB
Go

package main
import (
"errors"
"fmt"
"math"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/pdfcpu/pdfcpu/pkg/api"
"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/types"
"rsc.io/pdf"
)
// Tuning constants for the crop computation. Both are compared against
// coordinates read from the PDF, i.e. they are expressed in PDF points.
const (
// paddingPoints is the margin added on every side of the detected content
// box before it is clamped to the page box.
paddingPoints = 6.0
// minSidePoints is the smallest width or height a crop box may have; pages
// whose computed box is smaller on either side are left untouched.
minSidePoints = 1.0
)
// rect is an axis-aligned rectangle described by its lower-left corner
// (llx, lly) and its upper-right corner (urx, ury). A rect is only guaranteed
// to satisfy llx <= urx and lly <= ury after normalize has been applied.
type rect struct {
	llx, lly float64
	urx, ury float64
}
// pageCrop is the crop decision for a single page. trimPDF builds one entry
// per page and later applies only those entries whose apply flag is set.
type pageCrop struct {
// crop is the box to crop the page to; it is only meaningful when apply is true.
crop rect
// apply is false for pages that must be left untouched (the zero value).
apply bool
}
// matrix is a 3x3 transformation matrix used with row vectors: a point
// (x, y, 1) is multiplied from the left, so the translation lives in row 2
// and the six PDF matrix operands a b c d e f fill the first two columns.
type matrix [3][3]float64

// identity is the matrix that leaves every point unchanged.
var identity = matrix{
	0: {0: 1},
	1: {1: 1},
	2: {2: 1},
}
// main validates the command line (exactly one existing *.pdf path is
// expected as the first argument), trims the file into a sibling
// "-trimmed.pdf" file and reports the result. Every failure is printed to
// stderr and terminates the process with exit status 1.
func main() {
	args := os.Args
	if len(args) < 2 {
		fmt.Fprintf(os.Stderr, "Pouzitie: %s subor.pdf\n", filepath.Base(args[0]))
		os.Exit(1)
	}

	input := args[1]

	// The extension check is case-insensitive so that "X.PDF" is accepted too.
	if ext := filepath.Ext(input); strings.ToLower(ext) != ".pdf" {
		fmt.Fprintln(os.Stderr, "Chyba: prvy argument musi byt PDF subor.")
		os.Exit(1)
	}

	_, statErr := os.Stat(input)
	if statErr != nil {
		fmt.Fprintf(os.Stderr, "Chyba: subor %q neexistuje alebo sa neda citat (%v).\n", input, statErr)
		os.Exit(1)
	}

	output := trimmedPath(input)
	err := trimPDF(input, output)
	if err == nil {
		fmt.Printf("Hotovo: %s\n", output)
		return
	}
	fmt.Fprintf(os.Stderr, "Chyba: %v\n", err)
	os.Exit(1)
}
func trimPDF(input, output string) error {
reader, err := pdf.Open(input)
if err != nil {
return fmt.Errorf("nepodarilo sa otvorit PDF: %w", err)
}
pageCount := reader.NumPage()
if pageCount == 0 {
return errors.New("PDF nema ziadne strany")
}
plan := make([]pageCrop, pageCount)
for i := 1; i <= pageCount; i++ {
p := reader.Page(i)
if p.V.IsNull() {
continue
}
pageBox, err := effectivePageBox(p.V)
if err != nil {
return fmt.Errorf("strana %d: %w", i, err)
}
contentBox, found := detectContentBox(p)
if !found {
continue
}
contentBox = contentBox.expand(paddingPoints).clamp(pageBox)
if contentBox.width() < minSidePoints || contentBox.height() < minSidePoints {
continue
}
if nearlyEqualRect(contentBox, pageBox, 0.01) {
continue
}
plan[i-1] = pageCrop{crop: contentBox, apply: true}
}
if err := copyFile(input, output); err != nil {
return fmt.Errorf("nepodarilo sa pripravit vystupny subor: %w", err)
}
for i, item := range plan {
if !item.apply {
continue
}
boxDef := item.crop.boxDef()
cropBox, err := model.ParseBox(boxDef, types.POINTS)
if err != nil {
return fmt.Errorf("strana %d: neplatny crop box %s: %w", i+1, boxDef, err)
}
pageSel := []string{strconv.Itoa(i + 1)}
if err := api.CropFile(output, output, pageSel, cropBox, nil); err != nil {
return fmt.Errorf("strana %d: crop zlyhal: %w", i+1, err)
}
pageBounds, err := api.PageBoundaries("media:"+boxDef, types.POINTS)
if err != nil {
return fmt.Errorf("strana %d: nepodarilo sa pripravit media box: %w", i+1, err)
}
if err := api.AddBoxesFile(output, output, pageSel, pageBounds, nil); err != nil {
return fmt.Errorf("strana %d: nastavenie media box zlyhalo: %w", i+1, err)
}
}
return nil
}
// detectContentBox returns the union of the text bounding box and the
// graphics bounding box of page p. The boolean is false when neither detector
// produced a usable (finite) rectangle.
func detectContentBox(p pdf.Page) (rect, bool) {
	// Text is evaluated first, graphics second.
	detectors := [...]func(pdf.Page) (rect, bool){detectTextBox, detectGraphicsBox}

	var (
		merged rect
		have   bool
	)
	for _, detect := range detectors {
		part, ok := detect(p)
		if !ok {
			continue
		}
		part = part.normalize()
		switch {
		case !part.valid():
			// NaN or infinite coordinates: ignore this detector's result.
		case have:
			merged = merged.union(part)
		default:
			merged, have = part, true
		}
	}
	return merged, have
}
// textState is the subset of the PDF graphics/text state that detectTextBox
// tracks while walking a content stream. The field names follow the operator
// or parameter names used in content streams.
type textState struct {
	// Tc is set by the Tc and " operators, Tw by Tw and ", Th by Tz (stored
	// as a fraction, i.e. operand/100) and Tl by TL and TD.
	Tc, Tw, Th, Tl float64
	// Tf is the font selected by the Tf operator.
	Tf pdf.Font
	// Tfs is the size operand of Tf, Trise the operand of Ts.
	Tfs, Trise float64
	// Tm is the text matrix, Tlm the text line matrix and CTM the current
	// transformation matrix accumulated from cm operators.
	Tm, Tlm, CTM matrix
}
// detectTextBox estimates the bounding box of all non-blank glyphs shown on
// page p by replaying the text operators of its content streams.
//
// The result is an approximation: no glyph outlines are read. Every glyph is
// assumed to occupy, in em units, the horizontal range [0, width] (at least
// 0.2 em) and the vertical range [-0.30, 0.90] around the baseline; that box
// is transformed by the full text rendering matrix (font size, horizontal
// scaling, rise, text matrix and CTM), so scaled or rotated text is measured
// in page coordinates. Character codes are assumed to be single bytes, which
// is what Font.Width is called with.
//
// The boolean is false when the page has no interpretable content stream or
// when no glyph was found.
func detectTextBox(p pdf.Page) (rect, bool) {
	const (
		// fallbackGlyphWidth (in 1/1000 em) is used when no width is known.
		fallbackGlyphWidth = 500.0
		// fallbackFontSize is the glyph size assumed when the font size is 0.
		fallbackFontSize = 8.0
		glyphDescent     = 0.30
		glyphAscent      = 0.90
		minGlyphWidth    = 0.2
	)

	var box rect
	found := false
	add := func(r rect) {
		r = r.normalize()
		if !r.valid() {
			return
		}
		if !found {
			box = r
			found = true
			return
		}
		box = box.union(r)
	}

	g := textState{
		Th:  1,
		CTM: identity,
		Tm:  identity,
		Tlm: identity,
	}
	var enc pdf.TextEncoding

	// The encoder belongs to the selected font, so it has to be saved and
	// restored together with the rest of the state; otherwise text shown
	// after Q would be decoded with the font that was active inside q...Q.
	type savedState struct {
		g   textState
		enc pdf.TextEncoding
	}
	var gstack []savedState

	// nextLine implements T*: move to the start of the next line using the
	// current leading.
	nextLine := func() {
		x := matrix{{1, 0, 0}, {0, 1, 0}, {0, -g.Tl, 1}}
		g.Tlm = x.mul(g.Tlm)
		g.Tm = g.Tlm
	}

	showText := func(raw string) {
		decoded := raw
		if enc != nil {
			decoded = enc.Decode(raw)
		}
		// Font.Width yields 0 for every code when the font dictionary has no
		// Widths array. Advancing by 0 would collapse the whole string onto
		// its start point and the crop would cut the text off, so treat such
		// fonts like a missing font and use the fallback width.
		hasWidths := !g.Tf.V.IsNull() && g.Tf.V.Key("Widths").Kind() == pdf.Array

		i := 0
		for _, ch := range decoded {
			trm := matrix{{g.Tfs * g.Th, 0, 0}, {0, g.Tfs, 0}, {0, g.Trise, 1}}.mul(g.Tm).mul(g.CTM)
			w0 := fallbackGlyphWidth
			if hasWidths && i < len(raw) {
				w0 = g.Tf.Width(int(raw[i]))
			}
			if i < len(raw) {
				i++
			}
			if !unicodeIsSpace(ch) {
				if g.Tfs == 0 {
					// No usable font size: fall back to a fixed-size box
					// anchored at the glyph origin.
					add(rect{
						llx: trm[2][0],
						lly: trm[2][1] - glyphDescent*fallbackFontSize,
						urx: trm[2][0] + minGlyphWidth*fallbackFontSize,
						ury: trm[2][1] + glyphAscent*fallbackFontSize,
					})
				} else {
					// trm already contains font size, text matrix and CTM, so
					// the em box only needs to be transformed by it. Using
					// the font size alone would ignore any scaling or
					// rotation carried by Tm or cm.
					add(transformedRect(trm, 0, -glyphDescent, math.Max(w0/1000, minGlyphWidth), glyphAscent))
				}
			}
			tx := w0/1000*g.Tfs + g.Tc
			if ch == ' ' {
				tx += g.Tw
			}
			tx *= g.Th
			g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm)
		}
	}

	interpreted := interpretPageContent(p, func(stk *pdf.Stack, op string) {
		args := popArgs(stk)
		switch op {
		case "q":
			gstack = append(gstack, savedState{g: g, enc: enc})
		case "Q":
			if len(gstack) == 0 {
				return
			}
			top := gstack[len(gstack)-1]
			gstack = gstack[:len(gstack)-1]
			g, enc = top.g, top.enc
		case "cm":
			m, ok := matrixFromArgs(args)
			if ok {
				g.CTM = m.mul(g.CTM)
			}
		case "BT":
			g.Tm = identity
			g.Tlm = identity
		case "T*":
			nextLine()
		case "Tc":
			if len(args) == 1 {
				g.Tc = args[0].Float64()
			}
		case "TD":
			if len(args) != 2 {
				return
			}
			// TD is Td that additionally sets the leading to -ty.
			g.Tl = -args[1].Float64()
			fallthrough
		case "Td":
			if len(args) != 2 {
				return
			}
			tx := args[0].Float64()
			ty := args[1].Float64()
			x := matrix{{1, 0, 0}, {0, 1, 0}, {tx, ty, 1}}
			g.Tlm = x.mul(g.Tlm)
			g.Tm = g.Tlm
		case "Tf":
			if len(args) != 2 {
				return
			}
			g.Tf = p.Font(args[0].Name())
			g.Tfs = args[1].Float64()
			enc = nil
			if !g.Tf.V.IsNull() {
				enc = g.Tf.Encoder()
			}
		case "\"":
			if len(args) != 3 {
				return
			}
			// aw ac string ": set word and character spacing, then behave
			// like ' (move to the next line and show the string).
			g.Tw = args[0].Float64()
			g.Tc = args[1].Float64()
			nextLine()
			showText(args[2].RawString())
		case "'":
			if len(args) != 1 {
				return
			}
			nextLine()
			showText(args[0].RawString())
		case "Tj":
			if len(args) == 1 {
				showText(args[0].RawString())
			}
		case "TJ":
			if len(args) != 1 {
				return
			}
			v := args[0]
			for i := 0; i < v.Len(); i++ {
				x := v.Index(i)
				if x.Kind() == pdf.String {
					showText(x.RawString())
					continue
				}
				// Numbers are kerning adjustments in 1/1000 em, subtracted
				// from the current position.
				tx := -x.Float64() / 1000 * g.Tfs * g.Th
				g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm)
			}
		case "TL":
			if len(args) == 1 {
				g.Tl = args[0].Float64()
			}
		case "Tm":
			m, ok := matrixFromArgs(args)
			if ok {
				g.Tm = m
				g.Tlm = m
			}
		case "Ts":
			if len(args) == 1 {
				g.Trise = args[0].Float64()
			}
		case "Tw":
			if len(args) == 1 {
				g.Tw = args[0].Float64()
			}
		case "Tz":
			if len(args) == 1 {
				g.Th = args[0].Float64() / 100
			}
		}
	})
	if !interpreted {
		return rect{}, false
	}
	return box, found
}
// detectGraphicsBox estimates the bounding box of everything page p paints
// with graphics operators, in page coordinates.
//
// Covered content:
//   - paths built with m, l, c, v, y and re, counted only when the path is
//     actually painted (S, s, f, F, f*, B, B*, b, b*). Paths ended with n
//     (typically clipping paths such as a page-sized "re W n") do not count.
//     Curves are bounded by their control points; stroke width is ignored.
//   - image XObjects, which occupy the unit square under the current CTM.
//   - form XObjects, bounded by their BBox transformed by their Matrix and
//     the current CTM. The form's own content stream is not interpreted, so
//     the BBox is an upper bound of what the form paints.
//   - shadings painted with sh, bounded by the most recent clipping path
//     (ignored when no clipping path has been set).
//
// The boolean is false when the page has no interpretable content stream or
// when nothing was painted.
func detectGraphicsBox(p pdf.Page) (rect, bool) {
	var box rect
	found := false
	add := func(r rect) {
		r = r.normalize()
		if !r.valid() {
			return
		}
		if !found {
			box = r
			found = true
			return
		}
		box = box.union(r)
	}

	// graphicsState is what q saves and Q restores.
	type graphicsState struct {
		ctm     matrix
		clip    rect
		hasClip bool
	}
	gs := graphicsState{ctm: identity}
	var stack []graphicsState

	// The path under construction. It is not part of the saved graphics
	// state; it lives until a painting operator or n ends it.
	var (
		path        rect
		pathOpen    bool
		clipPending bool
	)
	extendPath := func(r rect) {
		r = r.normalize()
		if !r.valid() {
			return
		}
		if !pathOpen {
			path, pathOpen = r, true
			return
		}
		path = path.union(r)
	}
	endPath := func(painted bool) {
		if pathOpen {
			if painted {
				add(path)
			}
			if clipPending {
				// Replacing instead of intersecting with the previous clip
				// can only over-estimate the clipped area, which is the safe
				// direction for cropping.
				gs.clip, gs.hasClip = path, true
			}
		}
		pathOpen, clipPending = false, false
	}

	xObjects := p.Resources().Key("XObject")

	interpreted := interpretPageContent(p, func(stk *pdf.Stack, op string) {
		args := popArgs(stk)
		switch op {
		case "q":
			stack = append(stack, gs)
		case "Q":
			if len(stack) == 0 {
				return
			}
			gs = stack[len(stack)-1]
			stack = stack[:len(stack)-1]
		case "cm":
			m, ok := matrixFromArgs(args)
			if ok {
				gs.ctm = m.mul(gs.ctm)
			}
		case "m", "l", "c", "v", "y":
			// Operand count: m and l take one point, v and y two, c three.
			want := 2
			switch op {
			case "c":
				want = 6
			case "v", "y":
				want = 4
			}
			if len(args) != want {
				return
			}
			for i := 0; i < want; i += 2 {
				x, y := transformPoint(gs.ctm, args[i].Float64(), args[i+1].Float64())
				extendPath(rect{llx: x, lly: y, urx: x, ury: y})
			}
		case "re":
			if len(args) != 4 {
				return
			}
			x := args[0].Float64()
			y := args[1].Float64()
			w := args[2].Float64()
			h := args[3].Float64()
			extendPath(transformedRect(gs.ctm, x, y, x+w, y+h))
		case "W", "W*":
			// The clip takes effect when the current path is ended.
			clipPending = true
		case "S", "s", "f", "F", "f*", "B", "B*", "b", "b*":
			endPath(true)
		case "n":
			endPath(false)
		case "sh":
			// A shading fills the current clipping region.
			if gs.hasClip {
				add(gs.clip)
			}
		case "Do":
			if len(args) != 1 {
				return
			}
			name := args[0].Name()
			if name == "" || xObjects.IsNull() {
				return
			}
			xobj := xObjects.Key(name)
			switch xobj.Key("Subtype").Name() {
			case "Image":
				add(transformedRect(gs.ctm, 0, 0, 1, 1))
			case "Form":
				bbox, err := rectFromArray(xobj.Key("BBox"))
				if err != nil {
					// No usable BBox: keep the previous unit-square estimate
					// rather than dropping the object altogether.
					add(transformedRect(gs.ctm, 0, 0, 1, 1))
					return
				}
				// Form space -> user space is Matrix followed by the CTM.
				m := gs.ctm
				if mv := xobj.Key("Matrix"); mv.Len() == 6 {
					var fm matrix
					for i := 0; i < 6; i++ {
						fm[i/2][i%2] = mv.Index(i).Float64()
					}
					fm[2][2] = 1
					m = fm.mul(gs.ctm)
				}
				add(transformedRect(m, bbox.llx, bbox.lly, bbox.urx, bbox.ury))
			}
		}
	})
	if !interpreted {
		return rect{}, false
	}
	return box, found
}
// interpretPageContent feeds every content stream of page p to fn, in order.
// The page's Contents entry may be a single stream or an array of streams;
// array elements that are not streams are skipped. It returns true when at
// least one stream was interpreted without panicking.
func interpretPageContent(p pdf.Page, fn func(stk *pdf.Stack, op string)) bool {
	contents := p.V.Key("Contents")
	if contents.IsNull() {
		return false
	}

	succeeded := false
	switch contents.Kind() {
	case pdf.Stream:
		if safeInterpretStream(contents, fn) {
			succeeded = true
		}
	case pdf.Array:
		for idx := 0; idx < contents.Len(); idx++ {
			part := contents.Index(idx)
			if !part.IsNull() && part.Kind() == pdf.Stream {
				if safeInterpretStream(part, fn) {
					succeeded = true
				}
			}
		}
	}
	return succeeded
}
// safeInterpretStream runs pdf.Interpret on stream and reports whether it
// completed. A value that is not a stream yields false without being
// interpreted, and a panic raised during interpretation is recovered and
// reported as false instead of propagating to the caller.
func safeInterpretStream(stream pdf.Value, fn func(stk *pdf.Stack, op string)) (ok bool) {
	if isStream := !stream.IsNull() && stream.Kind() == pdf.Stream; !isStream {
		return false
	}

	defer func() {
		if r := recover(); r != nil {
			ok = false
		}
	}()

	pdf.Interpret(stream, fn)
	ok = true
	return ok
}
// unicodeIsSpace reports whether r is one of the four blank characters this
// program treats as invisible: space, horizontal tab, line feed or carriage
// return. Despite its name it does not recognise any other Unicode white
// space (for example U+00A0 or form feed).
func unicodeIsSpace(r rune) bool {
	switch r {
	case ' ', '\t', '\n', '\r':
		return true
	default:
		return false
	}
}
// popArgs empties the operand stack and returns its former contents in
// push order (the value popped last ends up at index 0). It returns nil
// when the stack is empty.
func popArgs(stk *pdf.Stack) []pdf.Value {
	count := stk.Len()
	if count == 0 {
		return nil
	}

	operands := make([]pdf.Value, count)
	// Pop yields the most recently pushed value first, so fill from the back.
	for filled := 0; filled < count; filled++ {
		operands[count-1-filled] = stk.Pop()
	}
	return operands
}
// matrixFromArgs builds a transformation matrix from the six numeric
// operands a b c d e f of a cm or Tm operator:
//
//	| a b 0 |
//	| c d 0 |
//	| e f 1 |
//
// The boolean is false (and the matrix zero) unless exactly six operands
// are supplied.
func matrixFromArgs(args []pdf.Value) (matrix, bool) {
	if len(args) != 6 {
		return matrix{}, false
	}

	a, b := args[0].Float64(), args[1].Float64()
	c, d := args[2].Float64(), args[3].Float64()
	e, f := args[4].Float64(), args[5].Float64()

	return matrix{
		{a, b, 0},
		{c, d, 0},
		{e, f, 1},
	}, true
}
// transformedRect maps the four corners of the rectangle (x0, y0)-(x1, y1)
// through m and returns the axis-aligned bounding box of the results. The
// returned rect is therefore always normalised, even when m mirrors or
// rotates the input.
func transformedRect(m matrix, x0, y0, x1, y1 float64) rect {
	corners := [4][2]float64{{x0, y0}, {x1, y0}, {x0, y1}, {x1, y1}}

	px, py := transformPoint(m, corners[0][0], corners[0][1])
	bounds := rect{llx: px, lly: py, urx: px, ury: py}
	for _, corner := range corners[1:] {
		px, py = transformPoint(m, corner[0], corner[1])
		bounds.llx = math.Min(bounds.llx, px)
		bounds.lly = math.Min(bounds.lly, py)
		bounds.urx = math.Max(bounds.urx, px)
		bounds.ury = math.Max(bounds.ury, py)
	}
	return bounds
}
// transformPoint applies m to the point (x, y) using the row-vector
// convention [x y 1] * m and returns the transformed coordinates.
func transformPoint(m matrix, x, y float64) (float64, float64) {
	xAxis, yAxis, origin := m[0], m[1], m[2]
	return x*xAxis[0] + y*yAxis[0] + origin[0],
		x*xAxis[1] + y*yAxis[1] + origin[1]
}
// effectivePageBox returns the rectangle that bounds the visible area of
// page: its (possibly inherited) CropBox when a valid one exists, otherwise
// its (possibly inherited) MediaBox. An error is returned when neither can
// be found.
func effectivePageBox(page pdf.Value) (rect, error) {
	// Order matters: CropBox takes precedence over MediaBox.
	for _, key := range [...]string{"CropBox", "MediaBox"} {
		if box, ok := inheritedRect(page, key); ok {
			return box, nil
		}
	}
	return rect{}, errors.New("chybajuci MediaBox/CropBox")
}
// inheritedRect looks up the rectangle stored under key on v or, when v has
// no valid rectangle there, on the nearest ancestor reached through the
// Parent entries. The boolean is false when no valid rectangle is found.
//
// The walk is limited to maxDepth levels: a malformed file can contain a
// cyclic Parent chain, and without a bound such a file would make this
// function loop forever.
func inheritedRect(v pdf.Value, key string) (rect, bool) {
	// Far deeper than any sensibly built page tree.
	const maxDepth = 100

	cur := v
	for depth := 0; depth < maxDepth && !cur.IsNull(); depth++ {
		if r, err := rectFromArray(cur.Key(key)); err == nil {
			return r, true
		}
		cur = cur.Key("Parent")
	}
	return rect{}, false
}
// rectFromArray converts a four-element PDF array into a normalised rect.
// It returns an error when v is null, does not have exactly four elements,
// or contains a coordinate that is NaN or infinite.
func rectFromArray(v pdf.Value) (rect, error) {
	invalid := func() (rect, error) {
		return rect{}, errors.New("neplatny rectangle")
	}

	if v.IsNull() || v.Len() != 4 {
		return invalid()
	}

	var coords [4]float64
	for i := range coords {
		coords[i] = v.Index(i).Float64()
	}

	box := rect{llx: coords[0], lly: coords[1], urx: coords[2], ury: coords[3]}.normalize()
	if box.valid() {
		return box, nil
	}
	return invalid()
}
// copyFile copies the contents of the file at src into the file at dst,
// creating dst or truncating it when it already exists. When copying or
// closing dst fails, the incomplete dst is removed and the first error that
// occurred is returned.
func copyFile(src, dst string) error {
	source, err := os.Open(src)
	if err != nil {
		return err
	}
	defer source.Close()

	target, err := os.Create(dst)
	if err != nil {
		return err
	}

	_, copyErr := target.ReadFrom(source)
	closeErr := target.Close()

	// A copy error takes precedence over a close error.
	failure := copyErr
	if failure == nil {
		failure = closeErr
	}
	if failure == nil {
		return nil
	}

	// Best effort: the original failure is what matters to the caller.
	_ = os.Remove(dst)
	return failure
}
// trimmedPath derives the output path for input by replacing its file
// extension (if any) with "-trimmed.pdf"; the directory part is kept.
func trimmedPath(input string) string {
	const suffix = "-trimmed.pdf"
	stem := strings.TrimSuffix(input, filepath.Ext(input))
	return stem + suffix
}
// nearlyEqualRect reports whether every coordinate of a differs from the
// corresponding coordinate of b by at most eps. Any NaN coordinate makes the
// result false.
func nearlyEqualRect(a, b rect, eps float64) bool {
	pairs := [4][2]float64{
		{a.llx, b.llx},
		{a.lly, b.lly},
		{a.urx, b.urx},
		{a.ury, b.ury},
	}
	for _, pair := range pairs {
		// Written as a negated <= so that NaN differences count as unequal.
		if !(math.Abs(pair[0]-pair[1]) <= eps) {
			return false
		}
	}
	return true
}
// mul returns the matrix product m * other. With the row-vector convention
// used in this file, the result applies m first and other second.
func (m matrix) mul(other matrix) matrix {
	var product matrix
	for row := range product {
		for col := range product[row] {
			var sum float64
			for k := range other {
				sum += m[row][k] * other[k][col]
			}
			product[row][col] = sum
		}
	}
	return product
}
// normalize returns r with its corners ordered so that llx <= urx and
// lly <= ury; each axis is swapped independently when it is inverted.
func (r rect) normalize() rect {
	ordered := r
	if r.urx < r.llx {
		ordered.llx, ordered.urx = r.urx, r.llx
	}
	if r.ury < r.lly {
		ordered.lly, ordered.ury = r.ury, r.lly
	}
	return ordered
}
// valid reports whether all four coordinates of r are finite numbers, i.e.
// none of them is NaN or +/-Inf. It does not check the ordering of the
// corners.
func (r rect) valid() bool {
	for _, coord := range [...]float64{r.llx, r.lly, r.urx, r.ury} {
		if math.IsNaN(coord) || math.IsInf(coord, 0) {
			return false
		}
	}
	return true
}
// width returns the horizontal extent of r (urx - llx); the result is
// negative when r is not normalised along the x axis.
func (r rect) width() float64 {
	left, right := r.llx, r.urx
	return right - left
}
// height returns the vertical extent of r (ury - lly); the result is
// negative when r is not normalised along the y axis.
func (r rect) height() float64 {
	bottom, top := r.lly, r.ury
	return top - bottom
}
// union returns the smallest rectangle that contains both r and other. Both
// operands are expected to be normalised.
func (r rect) union(other rect) rect {
	merged := r
	merged.llx = math.Min(r.llx, other.llx)
	merged.lly = math.Min(r.lly, other.lly)
	merged.urx = math.Max(r.urx, other.urx)
	merged.ury = math.Max(r.ury, other.ury)
	return merged
}
// expand returns r grown by padding on all four sides. A negative padding
// shrinks the rectangle; the result is not re-normalised.
func (r rect) expand(padding float64) rect {
	r.llx -= padding
	r.lly -= padding
	r.urx += padding
	r.ury += padding
	return r
}
// clamp returns r with every coordinate forced into the range spanned by
// limit: x coordinates into [limit.llx, limit.urx] and y coordinates into
// [limit.lly, limit.ury].
func (r rect) clamp(limit rect) rect {
	xLo, xHi := limit.llx, limit.urx
	yLo, yHi := limit.lly, limit.ury

	var bounded rect
	bounded.llx = clamp(r.llx, xLo, xHi)
	bounded.lly = clamp(r.lly, yLo, yHi)
	bounded.urx = clamp(r.urx, xLo, xHi)
	bounded.ury = clamp(r.ury, yLo, yHi)
	return bounded
}
// boxDef formats r as "[llx lly urx ury]" with four decimal places per
// coordinate; trimPDF passes the result to the pdfcpu box parsers.
func (r rect) boxDef() string {
	coords := []interface{}{r.llx, r.lly, r.urx, r.ury}
	return fmt.Sprintf("[%.4f %.4f %.4f %.4f]", coords...)
}
// clamp limits v to the range [minV, maxV]. The lower bound is checked
// first, so when the bounds are inverted (minV > maxV) a value below minV
// yields minV. A NaN value is returned unchanged.
func clamp(v, minV, maxV float64) float64 {
	switch {
	case v < minV:
		return minV
	case v > maxV:
		return maxV
	default:
		return v
	}
}