// Command main trims the pages of a PDF to the bounding box of their detected
// content. For every page it estimates where text, painted rectangles and
// XObjects are drawn, pads that area slightly, and writes a copy of the input
// ("<name>-trimmed.pdf") whose crop box and media box match the result.
package main

import (
	"errors"
	"fmt"
	"math"
	"os"
	"path/filepath"
	"strconv"
	"strings"

	"github.com/pdfcpu/pdfcpu/pkg/api"
	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
	"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/types"
	"rsc.io/pdf"
)

const (
	// paddingPoints is the margin, in points, added on every side of the
	// detected content before the box is clamped to the page.
	paddingPoints = 6.0
	// minSidePoints is the smallest width/height a crop box may have; pages
	// whose computed box is smaller are left untouched.
	minSidePoints = 1.0
	// maxInheritDepth bounds the walk up the Parent chain when resolving an
	// inherited page attribute, so a malformed (cyclic) page tree cannot hang
	// the program.
	maxInheritDepth = 64
)

// rect is an axis-aligned rectangle given by its lower-left and upper-right
// corners.
type rect struct {
	llx float64
	lly float64
	urx float64
	ury float64
}

// pageCrop is the planned action for one page: the box to apply and whether
// the page should be modified at all.
type pageCrop struct {
	crop  rect
	apply bool
}

// matrix is a 3x3 transformation matrix used with row vectors, i.e. a point
// (x, y, 1) is transformed as p * m (see transformPoint).
type matrix [3][3]float64

var identity = matrix{{1, 0, 0}, {0, 1, 0}, {0, 0, 1}}

// main validates the command line, derives the output path and runs the trim.
// It exits with status 1 on any error.
func main() {
	if len(os.Args) < 2 {
		fmt.Fprintf(os.Stderr, "Pouzitie: %s subor.pdf\n", filepath.Base(os.Args[0]))
		os.Exit(1)
	}

	input := os.Args[1]
	if strings.ToLower(filepath.Ext(input)) != ".pdf" {
		fmt.Fprintln(os.Stderr, "Chyba: prvy argument musi byt PDF subor.")
		os.Exit(1)
	}
	if _, err := os.Stat(input); err != nil {
		fmt.Fprintf(os.Stderr, "Chyba: subor %q neexistuje alebo sa neda citat (%v).\n", input, err)
		os.Exit(1)
	}

	output := trimmedPath(input)
	if err := trimPDF(input, output); err != nil {
		fmt.Fprintf(os.Stderr, "Chyba: %v\n", err)
		os.Exit(1)
	}
	fmt.Printf("Hotovo: %s\n", output)
}

// trimPDF computes a crop box for every page of input, copies input to output
// and applies the boxes to the copy. If applying the boxes fails, the partially
// processed output file is removed so no half-trimmed result is left behind.
func trimPDF(input, output string) error {
	plan, err := planCrops(input)
	if err != nil {
		return err
	}

	if err := copyFile(input, output); err != nil {
		return fmt.Errorf("nepodarilo sa pripravit vystupny subor: %w", err)
	}

	if err := applyCrops(output, plan); err != nil {
		// Best-effort cleanup; the crop error is the one worth reporting.
		_ = os.Remove(output)
		return err
	}
	return nil
}

// planCrops reads input and returns one pageCrop per page (index 0 is page 1).
// Pages without a usable content box, or whose content box already matches the
// page box, get a zero pageCrop (apply == false).
func planCrops(input string) ([]pageCrop, error) {
	// The file is opened here rather than through pdf.Open so that the handle
	// can be closed once detection is finished.
	f, err := os.Open(input)
	if err != nil {
		return nil, fmt.Errorf("nepodarilo sa otvorit PDF: %w", err)
	}
	defer f.Close()

	info, err := f.Stat()
	if err != nil {
		return nil, fmt.Errorf("nepodarilo sa otvorit PDF: %w", err)
	}

	reader, err := pdf.NewReader(f, info.Size())
	if err != nil {
		return nil, fmt.Errorf("nepodarilo sa otvorit PDF: %w", err)
	}

	pageCount := reader.NumPage()
	if pageCount == 0 {
		return nil, errors.New("PDF nema ziadne strany")
	}

	plan := make([]pageCrop, pageCount)
	for i := 1; i <= pageCount; i++ {
		p := reader.Page(i)
		if p.V.IsNull() {
			continue
		}

		pageBox, err := effectivePageBox(p.V)
		if err != nil {
			return nil, fmt.Errorf("strana %d: %w", i, err)
		}

		contentBox, found := detectContentBox(p)
		if !found {
			continue
		}

		contentBox = contentBox.expand(paddingPoints).clamp(pageBox)
		if contentBox.width() < minSidePoints || contentBox.height() < minSidePoints {
			continue
		}
		if nearlyEqualRect(contentBox, pageBox, 0.01) {
			continue
		}

		plan[i-1] = pageCrop{crop: contentBox, apply: true}
	}
	return plan, nil
}

// applyCrops rewrites the file at path in place, setting the crop box and the
// media box of every planned page to the page's computed box.
func applyCrops(path string, plan []pageCrop) error {
	for i, item := range plan {
		if !item.apply {
			continue
		}
		pageNr := i + 1

		boxDef := item.crop.boxDef()
		cropBox, err := model.ParseBox(boxDef, types.POINTS)
		if err != nil {
			return fmt.Errorf("strana %d: neplatny crop box %s: %w", pageNr, boxDef, err)
		}

		pageSel := []string{strconv.Itoa(pageNr)}
		if err := api.CropFile(path, path, pageSel, cropBox, nil); err != nil {
			return fmt.Errorf("strana %d: crop zlyhal: %w", pageNr, err)
		}

		pageBounds, err := api.PageBoundaries("media:"+boxDef, types.POINTS)
		if err != nil {
			return fmt.Errorf("strana %d: nepodarilo sa pripravit media box: %w", pageNr, err)
		}
		if err := api.AddBoxesFile(path, path, pageSel, pageBounds, nil); err != nil {
			return fmt.Errorf("strana %d: nastavenie media box zlyhalo: %w", pageNr, err)
		}
	}
	return nil
}

// boxAccumulator collects the union of all valid rectangles passed to add.
type boxAccumulator struct {
	box   rect
	found bool
}

// add normalizes r and merges it into the accumulated box. Rectangles with
// NaN or infinite coordinates are ignored.
func (a *boxAccumulator) add(r rect) {
	r = r.normalize()
	if !r.valid() {
		return
	}
	if !a.found {
		a.box = r
		a.found = true
		return
	}
	a.box = a.box.union(r)
}

// detectContentBox returns the union of the text box and the graphics box of
// page p. The boolean is false when neither detector found anything.
func detectContentBox(p pdf.Page) (rect, bool) {
	var acc boxAccumulator
	if tb, ok := detectTextBox(p); ok {
		acc.add(tb)
	}
	if gb, ok := detectGraphicsBox(p); ok {
		acc.add(gb)
	}
	return acc.box, acc.found
}

// textState is the part of the graphics state tracked while walking a content
// stream for text. Each field is updated by the content-stream operator of the
// same name (Tc, Tw, Tz -> Th, TL -> Tl, Tf, Ts -> Trise, Tm, cm -> CTM); Tlm
// is the text line matrix maintained alongside Tm.
type textState struct {
	Tc    float64
	Tw    float64
	Th    float64
	Tl    float64
	Tf    pdf.Font
	Tfs   float64
	Trise float64
	Tm    matrix
	Tlm   matrix
	CTM   matrix
}

// detectTextBox interprets the content streams of p and returns the union of
// the estimated boxes of all non-space glyphs shown. The boolean is false when
// no content stream could be interpreted or no glyph was found.
func detectTextBox(p pdf.Page) (rect, bool) {
	var acc boxAccumulator

	g := textState{
		Th:  1,
		CTM: identity,
		Tm:  identity,
		Tlm: identity,
	}
	var enc pdf.TextEncoding
	var gstack []textState

	// nextLine moves the text position down by the current leading, as done
	// by the T*, ' and " operators.
	nextLine := func() {
		x := matrix{{1, 0, 0}, {0, 1, 0}, {0, -g.Tl, 1}}
		g.Tlm = x.mul(g.Tlm)
		g.Tm = g.Tlm
	}

	showText := func(raw string) {
		decoded := raw
		if enc != nil {
			decoded = enc.Decode(raw)
		}

		// i walks the raw bytes in step with the decoded runes; glyph widths
		// are looked up by raw byte code. Once the raw bytes run out the
		// default width is used.
		i := 0
		for _, ch := range decoded {
			trm := matrix{{g.Tfs * g.Th, 0, 0}, {0, g.Tfs, 0}, {0, g.Trise, 1}}.mul(g.Tm).mul(g.CTM)

			w0 := 500.0
			if i < len(raw) && !g.Tf.V.IsNull() {
				w0 = g.Tf.Width(int(raw[i]))
			}
			if i < len(raw) {
				i++
			}

			if !unicodeIsSpace(ch) {
				acc.add(glyphBox(trm, w0))
			}

			tx := w0/1000*g.Tfs + g.Tc
			if ch == ' ' {
				tx += g.Tw
			}
			tx *= g.Th
			g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm)
		}
	}

	interpreted := interpretPageContent(p, func(stk *pdf.Stack, op string) {
		args := popArgs(stk)
		switch op {
		case "q":
			gstack = append(gstack, g)
		case "Q":
			if len(gstack) == 0 {
				return
			}
			g = gstack[len(gstack)-1]
			gstack = gstack[:len(gstack)-1]
		case "cm":
			if m, ok := matrixFromArgs(args); ok {
				g.CTM = m.mul(g.CTM)
			}
		case "BT":
			g.Tm = identity
			g.Tlm = identity
		case "T*":
			nextLine()
		case "Tc":
			if len(args) == 1 {
				g.Tc = args[0].Float64()
			}
		case "TD":
			if len(args) != 2 {
				return
			}
			// TD is Td that additionally sets the leading to -ty.
			g.Tl = -args[1].Float64()
			fallthrough
		case "Td":
			if len(args) != 2 {
				return
			}
			tx := args[0].Float64()
			ty := args[1].Float64()
			x := matrix{{1, 0, 0}, {0, 1, 0}, {tx, ty, 1}}
			g.Tlm = x.mul(g.Tlm)
			g.Tm = g.Tlm
		case "Tf":
			if len(args) != 2 {
				return
			}
			g.Tf = p.Font(args[0].Name())
			g.Tfs = args[1].Float64()
			enc = nil
			if !g.Tf.V.IsNull() {
				enc = g.Tf.Encoder()
			}
		case "\"":
			if len(args) != 3 {
				return
			}
			g.Tw = args[0].Float64()
			g.Tc = args[1].Float64()
			// Like ', the " operator moves to the next line before showing
			// the string.
			nextLine()
			showText(args[2].RawString())
		case "'":
			if len(args) != 1 {
				return
			}
			nextLine()
			showText(args[0].RawString())
		case "Tj":
			if len(args) == 1 {
				showText(args[0].RawString())
			}
		case "TJ":
			if len(args) != 1 {
				return
			}
			v := args[0]
			for i := 0; i < v.Len(); i++ {
				x := v.Index(i)
				if x.Kind() == pdf.String {
					showText(x.RawString())
					continue
				}
				// Numeric elements shift the text position, in thousandths
				// of a text-space unit, against the writing direction.
				tx := -x.Float64() / 1000 * g.Tfs * g.Th
				g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm)
			}
		case "TL":
			if len(args) == 1 {
				g.Tl = args[0].Float64()
			}
		case "Tm":
			if m, ok := matrixFromArgs(args); ok {
				g.Tm = m
				g.Tlm = m
			}
		case "Ts":
			if len(args) == 1 {
				g.Trise = args[0].Float64()
			}
		case "Tw":
			if len(args) == 1 {
				g.Tw = args[0].Float64()
			}
		case "Tz":
			if len(args) == 1 {
				g.Th = args[0].Float64() / 100
			}
		}
	})
	if !interpreted {
		return rect{}, false
	}
	return acc.box, acc.found
}

// glyphBox estimates the bounding box of one glyph. trm is the text rendering
// matrix at the glyph origin and w0 the glyph width in thousandths of a
// text-space unit. The glyph is modelled as spanning from 0.30 below to 0.90
// above the baseline (in units of the font size) and at least 0.2 units wide;
// that box is pushed through trm so that scaling, flipping and rotation coming
// from Tm or the CTM are reflected in the result.
func glyphBox(trm matrix, w0 float64) rect {
	if math.Hypot(trm[1][0], trm[1][1]) == 0 {
		// Degenerate vertical scale (e.g. font size 0): fall back to a
		// nominal 8pt box anchored at the glyph origin.
		const h = 8.0
		return rect{
			llx: trm[2][0],
			lly: trm[2][1] - 0.30*h,
			urx: trm[2][0] + 0.2*h,
			ury: trm[2][1] + 0.90*h,
		}
	}
	return transformedRect(trm, 0, -0.30, math.Max(w0/1000, 0.2), 0.90)
}

// detectGraphicsBox interprets the content streams of p and returns the union
// of all painted rectangles ("re" followed by a path-painting operator) and of
// the unit square of every Image/Form XObject drawn with "Do". The boolean is
// false when no content stream could be interpreted or nothing was found.
func detectGraphicsBox(p pdf.Page) (rect, bool) {
	var acc boxAccumulator

	xObjects := p.Resources().Key("XObject")
	ctm := identity
	var ctmStack []matrix
	// pending holds rectangles of the path under construction. They only
	// count as content once the path is actually painted; a path ended with
	// "n" (typically a clipping path, "re W n") is discarded.
	var pending []rect

	interpreted := interpretPageContent(p, func(stk *pdf.Stack, op string) {
		args := popArgs(stk)
		switch op {
		case "q":
			ctmStack = append(ctmStack, ctm)
		case "Q":
			if len(ctmStack) == 0 {
				return
			}
			ctm = ctmStack[len(ctmStack)-1]
			ctmStack = ctmStack[:len(ctmStack)-1]
		case "cm":
			if m, ok := matrixFromArgs(args); ok {
				ctm = m.mul(ctm)
			}
		case "re":
			if len(args) != 4 {
				return
			}
			x := args[0].Float64()
			y := args[1].Float64()
			w := args[2].Float64()
			h := args[3].Float64()
			pending = append(pending, transformedRect(ctm, x, y, x+w, y+h))
		case "S", "s", "f", "F", "f*", "B", "B*", "b", "b*":
			for _, r := range pending {
				acc.add(r)
			}
			pending = pending[:0]
		case "n":
			pending = pending[:0]
		case "Do":
			if len(args) != 1 {
				return
			}
			name := args[0].Name()
			if name == "" || xObjects.IsNull() {
				return
			}
			subtype := xObjects.Key(name).Key("Subtype").Name()
			if subtype == "Image" || subtype == "Form" {
				// Approximation: the XObject is taken to cover the unit
				// square under the current CTM. The form's own BBox/Matrix
				// are not consulted.
				acc.add(transformedRect(ctm, 0, 0, 1, 1))
			}
		}
	})
	if !interpreted {
		return rect{}, false
	}
	return acc.box, acc.found
}

// interpretPageContent runs fn over every content stream of p (a single stream
// or an array of streams). It reports whether at least one stream was
// interpreted without panicking.
func interpretPageContent(p pdf.Page, fn func(stk *pdf.Stack, op string)) bool {
	contents := p.V.Key("Contents")
	if contents.IsNull() {
		return false
	}

	ok := false
	switch contents.Kind() {
	case pdf.Stream:
		ok = safeInterpretStream(contents, fn)
	case pdf.Array:
		for i := 0; i < contents.Len(); i++ {
			stream := contents.Index(i)
			if stream.IsNull() || stream.Kind() != pdf.Stream {
				continue
			}
			if safeInterpretStream(stream, fn) {
				ok = true
			}
		}
	}
	return ok
}

// safeInterpretStream interprets a single content stream with fn and converts
// a panic raised during interpretation into a false result.
func safeInterpretStream(stream pdf.Value, fn func(stk *pdf.Stack, op string)) (ok bool) {
	if stream.IsNull() || stream.Kind() != pdf.Stream {
		return false
	}
	defer func() {
		if recover() != nil {
			ok = false
		}
	}()
	pdf.Interpret(stream, fn)
	return true
}

// unicodeIsSpace reports whether r is a space, tab, line feed or carriage
// return.
func unicodeIsSpace(r rune) bool {
	switch r {
	case ' ', '\t', '\n', '\r':
		return true
	}
	return false
}

// popArgs removes all operands from stk and returns them in the order they
// were pushed. It returns nil for an empty stack.
func popArgs(stk *pdf.Stack) []pdf.Value {
	n := stk.Len()
	if n == 0 {
		return nil
	}
	args := make([]pdf.Value, n)
	for i := n - 1; i >= 0; i-- {
		args[i] = stk.Pop()
	}
	return args
}

// matrixFromArgs builds a matrix from the six operands a b c d e f of a "cm"
// or "Tm" operator: rows (a b 0), (c d 0), (e f 1). The boolean is false when
// args does not hold exactly six values.
func matrixFromArgs(args []pdf.Value) (matrix, bool) {
	if len(args) != 6 {
		return matrix{}, false
	}
	var m matrix
	for i := 0; i < 6; i++ {
		m[i/2][i%2] = args[i].Float64()
	}
	m[2][2] = 1
	return m, true
}

// transformedRect transforms the four corners of the rectangle (x0, y0)-(x1,
// y1) by m and returns their axis-aligned bounding box.
func transformedRect(m matrix, x0, y0, x1, y1 float64) rect {
	p1x, p1y := transformPoint(m, x0, y0)
	p2x, p2y := transformPoint(m, x1, y0)
	p3x, p3y := transformPoint(m, x0, y1)
	p4x, p4y := transformPoint(m, x1, y1)

	return rect{
		llx: math.Min(math.Min(p1x, p2x), math.Min(p3x, p4x)),
		lly: math.Min(math.Min(p1y, p2y), math.Min(p3y, p4y)),
		urx: math.Max(math.Max(p1x, p2x), math.Max(p3x, p4x)),
		ury: math.Max(math.Max(p1y, p2y), math.Max(p3y, p4y)),
	}
}

// transformPoint applies m to the point (x, y) using the row-vector
// convention (x, y, 1) * m.
func transformPoint(m matrix, x, y float64) (float64, float64) {
	px := x*m[0][0] + y*m[1][0] + m[2][0]
	py := x*m[0][1] + y*m[1][1] + m[2][1]
	return px, py
}

// effectivePageBox returns the page's CropBox if one can be resolved, otherwise
// its MediaBox. Both are looked up on the page and its ancestors.
func effectivePageBox(page pdf.Value) (rect, error) {
	if r, ok := inheritedRect(page, "CropBox"); ok {
		return r, nil
	}
	if r, ok := inheritedRect(page, "MediaBox"); ok {
		return r, nil
	}
	return rect{}, errors.New("chybajuci MediaBox/CropBox")
}

// inheritedRect looks up key on v and then on its Parent chain, returning the
// first value that parses as a valid rectangle. The walk is limited to
// maxInheritDepth levels to stay safe on cyclic Parent references.
func inheritedRect(v pdf.Value, key string) (rect, bool) {
	cur := v
	for depth := 0; depth < maxInheritDepth && !cur.IsNull(); depth++ {
		if r, err := rectFromArray(cur.Key(key)); err == nil {
			return r, true
		}
		cur = cur.Key("Parent")
	}
	return rect{}, false
}

// rectFromArray converts a four-element PDF array into a normalized rect. It
// fails for null values, arrays of another length, and non-finite coordinates.
func rectFromArray(v pdf.Value) (rect, error) {
	if v.IsNull() || v.Len() != 4 {
		return rect{}, errors.New("neplatny rectangle")
	}
	r := rect{
		llx: v.Index(0).Float64(),
		lly: v.Index(1).Float64(),
		urx: v.Index(2).Float64(),
		ury: v.Index(3).Float64(),
	}.normalize()
	if !r.valid() {
		return rect{}, errors.New("neplatny rectangle")
	}
	return r, nil
}

// copyFile copies src to dst, creating or truncating dst. On any error the
// destination file is removed again.
func copyFile(src, dst string) error {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	defer in.Close()

	out, err := os.Create(dst)
	if err != nil {
		return err
	}

	_, err = out.ReadFrom(in)
	// A failed Close can mean lost data, so report it unless the copy itself
	// already failed.
	if closeErr := out.Close(); closeErr != nil && err == nil {
		err = closeErr
	}
	if err != nil {
		// Best-effort cleanup; the copy error is the one worth reporting.
		_ = os.Remove(dst)
	}
	return err
}

// trimmedPath returns input with its extension replaced by "-trimmed.pdf".
func trimmedPath(input string) string {
	ext := filepath.Ext(input)
	base := strings.TrimSuffix(input, ext)
	return base + "-trimmed.pdf"
}

// nearlyEqualRect reports whether all four coordinates of a and b differ by at
// most eps.
func nearlyEqualRect(a, b rect, eps float64) bool {
	return math.Abs(a.llx-b.llx) <= eps &&
		math.Abs(a.lly-b.lly) <= eps &&
		math.Abs(a.urx-b.urx) <= eps &&
		math.Abs(a.ury-b.ury) <= eps
}

// mul returns the matrix product m * other.
func (m matrix) mul(other matrix) matrix {
	var out matrix
	for i := 0; i < 3; i++ {
		for j := 0; j < 3; j++ {
			for k := 0; k < 3; k++ {
				out[i][j] += m[i][k] * other[k][j]
			}
		}
	}
	return out
}

// normalize returns r with its corners swapped where necessary so that
// llx <= urx and lly <= ury.
func (r rect) normalize() rect {
	if r.llx > r.urx {
		r.llx, r.urx = r.urx, r.llx
	}
	if r.lly > r.ury {
		r.lly, r.ury = r.ury, r.lly
	}
	return r
}

// valid reports whether every coordinate of r is a finite number.
func (r rect) valid() bool {
	for _, v := range [4]float64{r.llx, r.lly, r.urx, r.ury} {
		if math.IsNaN(v) || math.IsInf(v, 0) {
			return false
		}
	}
	return true
}

// width returns the horizontal extent of r.
func (r rect) width() float64 {
	return r.urx - r.llx
}

// height returns the vertical extent of r.
func (r rect) height() float64 {
	return r.ury - r.lly
}

// union returns the smallest rectangle containing both r and other.
func (r rect) union(other rect) rect {
	return rect{
		llx: math.Min(r.llx, other.llx),
		lly: math.Min(r.lly, other.lly),
		urx: math.Max(r.urx, other.urx),
		ury: math.Max(r.ury, other.ury),
	}
}

// expand returns r grown by padding on every side.
func (r rect) expand(padding float64) rect {
	return rect{
		llx: r.llx - padding,
		lly: r.lly - padding,
		urx: r.urx + padding,
		ury: r.ury + padding,
	}
}

// clamp returns r with every coordinate limited to the extent of limit.
func (r rect) clamp(limit rect) rect {
	return rect{
		llx: clamp(r.llx, limit.llx, limit.urx),
		lly: clamp(r.lly, limit.lly, limit.ury),
		urx: clamp(r.urx, limit.llx, limit.urx),
		ury: clamp(r.ury, limit.lly, limit.ury),
	}
}

// boxDef formats r as "[llx lly urx ury]" with four decimals, the textual box
// form handed to pdfcpu.
func (r rect) boxDef() string {
	return fmt.Sprintf("[%.4f %.4f %.4f %.4f]", r.llx, r.lly, r.urx, r.ury)
}

// clamp limits v to the closed interval [minV, maxV].
func clamp(v, minV, maxV float64) float64 {
	if v < minV {
		return minV
	}
	if v > maxV {
		return maxV
	}
	return v
}