The DOCX tracked-change pipeline
How a plain-text edit from the model becomes surgical Word tracked-change markup that preserves formatting — from the skill the model reads to the OOXML engine and LibreOffice finalization.
libreoffice-service/main.py · 568 lines · accept_all_tracked_changes L362–370
Outline 13 symbols
- _check_libreoffice function
- HealthResponse class
- health_check function
- convert_document function
- _install_macro function
- _run_macro function
- _apply_tracked_change_op function
- accept_all_tracked_changes function
- reject_all_tracked_changes function
- _install_compare_macro function
- compare_documents function
- root function
- global_exception_handler function
1"""
2LibreOffice Document Preview Service
3
4A lightweight FastAPI service that converts office documents to PDF
5using LibreOffice headless mode. Designed for high-fidelity preview
6of Word, PowerPoint, and Excel files including tracked changes, numbering, and formatting.
7
8Runs as a standalone container alongside DoclingService.
9"""
10
11import os
12import subprocess
13import tempfile
14import logging
15from pathlib import Path
16from urllib.parse import quote
17
18from fastapi import FastAPI, File, UploadFile, HTTPException
19from fastapi.middleware.cors import CORSMiddleware
20from fastapi.responses import Response, JSONResponse
21from pydantic import BaseModel
22
# Root logging configuration: INFO and above, each record prefixed with
# timestamp, logger name and level.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
# Module-level logger used by every handler in this file.
logger = logging.getLogger(__name__)

# Executable used for every LibreOffice invocation; overridable through the
# LIBREOFFICE_BIN environment variable, defaulting to "libreoffice".
LIBREOFFICE_BIN = os.getenv("LIBREOFFICE_BIN", "libreoffice")
30
31
def _check_libreoffice() -> bool:
    """Verify LibreOffice is installed and callable.

    Runs ``LIBREOFFICE_BIN --version`` with a 10-second timeout.

    Returns:
        True when the command exits with status 0. False when it exits with
        a non-zero status, times out, or cannot be started at all.
    """
    try:
        result = subprocess.run(
            [LIBREOFFICE_BIN, "--version"],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers a missing binary (FileNotFoundError) as well as a
        # path that exists but cannot be executed (e.g. PermissionError), so
        # the caller gets False instead of an unhandled exception.
        return False
    return result.returncode == 0
44
45
# ASGI application object; route decorators in this module attach to it.
# NOTE(review): the "/" endpoint reports version "1.2.0" while this metadata
# says "1.0.0" — confirm which one is current.
app = FastAPI(
    title="LibreOffice Document Preview Service",
    description="Converts DOCX to PDF via LibreOffice headless",
    version="1.0.0",
)

# Comma-separated origin list from the ALLOWED_ORIGINS environment variable
# ("*" by default). Entries are stripped so "a.example, b.example" works, and
# empty entries are dropped.
ALLOWED_ORIGINS = [
    origin.strip()
    for origin in os.getenv("ALLOWED_ORIGINS", "*").split(",")
    if origin.strip()
]
_ALLOW_ANY_ORIGIN = "*" in ALLOWED_ORIGINS
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"] if _ALLOW_ANY_ORIGIN else ALLOWED_ORIGINS,
    # Credentialed CORS must not be combined with a wildcard origin, so
    # cookies/authorization are only allowed when origins are listed
    # explicitly.
    allow_credentials=not _ALLOW_ANY_ORIGIN,
    allow_methods=["*"],
    allow_headers=["*"],
)
60
61
# Response schema of the /health endpoint.
class HealthResponse(BaseModel):
    # "ok" when the LibreOffice check succeeds, otherwise "degraded".
    status: str
    # Fixed service identifier ("libreoffice-preview" as set by health_check).
    service: str
    # Version string reported by health_check.
    version: str
    # Result of the LibreOffice availability check.
    libreoffice_ready: bool
67
68
@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Report whether the LibreOffice binary can be invoked.

    The status is "ok" when the availability check passes and "degraded"
    otherwise; the boolean result is returned alongside it.
    """
    libreoffice_ok = _check_libreoffice()
    if libreoffice_ok:
        state = "ok"
    else:
        state = "degraded"
    return HealthResponse(
        status=state,
        service="libreoffice-preview",
        version="1.0.0",
        libreoffice_ready=libreoffice_ok,
    )
79
80
# Upload extensions (lower-case, with leading dot) accepted by /convert.
SUPPORTED_INPUTS = {".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls"}
# Output formats accepted by /convert, keyed by the value of the ``format``
# query parameter. "ext" is the extension of the produced file and "mime" the
# media type sent in the response.
OUTPUT_FORMATS = {
    "pdf": {"ext": ".pdf", "mime": "application/pdf"},
    "docx": {
        "ext": ".docx",
        "mime": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    },
}
89
90
@app.post("/convert")
async def convert_document(
    file: UploadFile = File(...),
    format: str = "pdf",
):
    """
    Convert an uploaded office document using LibreOffice headless.

    Accepted inputs are the extensions in SUPPORTED_INPUTS (Word,
    PowerPoint and Excel files).

    Query params:
        format: output format — "pdf" (default) or "docx"

    Returns raw bytes of the converted file.

    Raises:
        HTTPException 400: missing filename, unsupported input extension or
            unsupported output format.
        HTTPException 500: LibreOffice exited non-zero, produced no output
            file, or an unexpected error occurred.
        HTTPException 504: the conversion exceeded the 120-second timeout.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="No filename provided")

    ext = os.path.splitext(file.filename)[1].lower()
    if ext not in SUPPORTED_INPUTS:
        # sorted() keeps the message stable; set iteration order is not.
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type: {ext}. Supported: {', '.join(sorted(SUPPORTED_INPUTS))}",
        )

    out_fmt = OUTPUT_FORMATS.get(format)
    if not out_fmt:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported output format: {format}. Supported: {', '.join(OUTPUT_FORMATS)}",
        )

    try:
        content = await file.read()
        logger.info(f"Converting {file.filename} ({len(content)} bytes) → {format}")

        with tempfile.TemporaryDirectory() as tmpdir:
            input_path = Path(tmpdir) / f"input{ext}"
            input_path.write_bytes(content)

            # Write the result into its own directory. With a shared
            # directory a docx -> docx conversion would have output path ==
            # input path, so the "was an output produced?" check below could
            # never fail and the untouched upload would be returned.
            out_dir = Path(tmpdir) / "out"
            out_dir.mkdir()

            # NOTE(review): subprocess.run blocks the event loop for the
            # duration of the conversion; confirm that this serialisation of
            # soffice runs is intended before moving it to a worker thread.
            result = subprocess.run(
                [
                    LIBREOFFICE_BIN,
                    "--headless",
                    "--norestore",
                    "--convert-to",
                    format,
                    "--outdir",
                    str(out_dir),
                    str(input_path),
                ],
                capture_output=True,
                text=True,
                timeout=120,
            )

            if result.returncode != 0:
                logger.error(f"LibreOffice failed: {result.stderr}")
                raise HTTPException(
                    status_code=500,
                    detail=f"LibreOffice conversion failed: {result.stderr[:500]}",
                )

            output_path = out_dir / f"input{out_fmt['ext']}"
            if not output_path.exists():
                logger.error(f"Output not produced. stdout={result.stdout}, stderr={result.stderr}")
                raise HTTPException(
                    status_code=500,
                    detail=f"LibreOffice did not produce a {format} file",
                )

            out_bytes = output_path.read_bytes()
            out_name = f"{Path(file.filename).stem}{out_fmt['ext']}"
            logger.info(f"Conversion OK: {file.filename} -> {len(out_bytes)} bytes {format}")

            # A bare quoted filename is only valid for printable ASCII
            # without '"' or '\\'. Anything else (non-ASCII names such as
            # Cyrillic or Arabic, or names with quotes) uses an ASCII
            # fallback plus the RFC 5987 filename* parameter.
            header_safe = (
                out_name.isascii()
                and out_name.isprintable()
                and '"' not in out_name
                and "\\" not in out_name
            )
            if header_safe:
                disposition = f'inline; filename="{out_name}"'
            else:
                encoded = quote(out_name)
                ascii_fallback = f"document{out_fmt['ext']}"
                disposition = f'inline; filename="{ascii_fallback}"; filename*=UTF-8\'\'{encoded}'

            return Response(
                content=out_bytes,
                media_type=out_fmt["mime"],
                headers={"Content-Disposition": disposition},
            )

    except subprocess.TimeoutExpired:
        logger.error("LibreOffice conversion timed out (120s)")
        raise HTTPException(status_code=504, detail="Conversion timed out")
    except HTTPException:
        # Already a well-formed HTTP error; do not wrap it in a 500.
        raise
    except Exception as e:
        logger.error(f"Conversion error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
187
188
189# ---------------------------------------------------------------------------
190# Tracked-changes accept/reject via LibreOffice UNO macro dispatch.
191#
192# Ports Anthropic's accept_changes.py pattern
193# (github.com/anthropics/skills/blob/main/skills/docx/scripts/accept_changes.py)
194# into a service endpoint so it can be called from the backend without
195# shipping soffice into the Docker sandbox.
196#
197# LibreOffice handles every OOXML edge case we care about — nested changes,
198# complex formatting, paragraph-mark deletions, table cell content, content
199# controls, comment anchors. Our lxml-based accept/reject missed several of
200# these, producing "unreadable content" dialogs in Word.
201# ---------------------------------------------------------------------------
202
203
# Dedicated LibreOffice user profile used for the macro-based operations; it
# is passed to soffice through -env:UserInstallation.
LIBREOFFICE_PROFILE = "/tmp/libreoffice_docx_profile"
# Directory of the profile's "Standard" Basic library, where Module1.xba is
# written.
MACRO_DIR = f"{LIBREOFFICE_PROFILE}/user/basic/Standard"


# Single Basic module with both Accept and Reject subs. LibreOffice seeds the
# Standard library infrastructure (script.xlb / script.xlc etc.) automatically
# during first-run init — we only need to drop Module1.xba in place AFTER
# that init has run. Matches Anthropic's accept_changes.py pattern.
# Each sub dispatches the corresponding .uno command on the current frame,
# stores the document back to the file it was loaded from, then closes it.
ACCEPT_CHANGES_MACRO = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE script:module PUBLIC "-//OpenOffice.org//DTD OfficeDocument 1.0//EN" "module.dtd">
<script:module xmlns:script="http://openoffice.org/2000/script" script:name="Module1" script:language="StarBasic">
 Sub AcceptAllTrackedChanges
 Dim document As Object
 Dim dispatcher As Object
 document = ThisComponent.CurrentController.Frame
 dispatcher = createUnoService("com.sun.star.frame.DispatchHelper")
 dispatcher.executeDispatch(document, ".uno:AcceptAllTrackedChanges", "", 0, Array())
 ThisComponent.store()
 ThisComponent.close(True)
 End Sub

 Sub RejectAllTrackedChanges
 Dim document As Object
 Dim dispatcher As Object
 document = ThisComponent.CurrentController.Frame
 dispatcher = createUnoService("com.sun.star.frame.DispatchHelper")
 dispatcher.executeDispatch(document, ".uno:RejectAllTrackedChanges", "", 0, Array())
 ThisComponent.store()
 ThisComponent.close(True)
 End Sub
</script:module>"""


# Process-local flag: True once the accept/reject module has been written by
# this process. The compare macro installer resets it to False because it
# overwrites the same Module1.xba file.
_MACRO_INSTALLED = False
238
239
def _install_macro() -> bool:
    """Idempotently write the macro into the LibreOffice user profile.

    If the profile doesn't exist, seed it first by running soffice with
    ``--terminate_after_init`` so LibreOffice scaffolds the Standard
    library structure. Then drop Module1.xba in place. Without the first-
    run init, a hand-created ``user/basic/Standard/`` isn't registered and
    the ``vnd.sun.star.script:Standard.Module1.X`` URL silently resolves
    to nothing.

    The on-disk module is the source of truth: the function returns early
    only when Module1.xba currently holds exactly ACCEPT_CHANGES_MACRO. The
    in-process flag alone is not trusted, because the file can be replaced
    by the compare macro (possibly from another worker process) or removed
    together with the profile directory.

    Returns:
        True when the module is in place, False when installation failed
        (the error is logged).
    """
    global _MACRO_INSTALLED
    try:
        macro_dir = Path(MACRO_DIR)
        module_path = macro_dir / "Module1.xba"

        if (
            module_path.is_file()
            and module_path.read_text(encoding="utf-8") == ACCEPT_CHANGES_MACRO
        ):
            _MACRO_INSTALLED = True
            return True

        if not macro_dir.exists():
            # First-run init; its exit status is deliberately ignored
            # (check=False) and the directory is created below regardless.
            subprocess.run(
                [
                    LIBREOFFICE_BIN,
                    "--headless",
                    f"-env:UserInstallation=file://{LIBREOFFICE_PROFILE}",
                    "--terminate_after_init",
                ],
                capture_output=True,
                timeout=30,
                check=False,
            )
            macro_dir.mkdir(parents=True, exist_ok=True)

        # The module declares encoding="UTF-8" in its XML prolog, so write it
        # as UTF-8 regardless of the process locale.
        module_path.write_text(ACCEPT_CHANGES_MACRO, encoding="utf-8")
        _MACRO_INSTALLED = True
        logger.info(f"Installed tracked-changes macro into {MACRO_DIR}")
        return True
    except Exception as e:
        logger.error(f"Failed to install macro: {e}")
        return False
276
277
def _run_macro(input_path: Path, macro_name: str) -> subprocess.CompletedProcess:
    """Execute one sub of Standard.Module1 against a document, headless.

    Args:
        input_path: document handed to soffice on the command line.
        macro_name: "AcceptAllTrackedChanges" | "RejectAllTrackedChanges".

    Returns:
        The completed soffice process with stdout/stderr captured as text.

    Raises:
        subprocess.TimeoutExpired: when soffice runs longer than 120 seconds.
    """
    profile_arg = f"-env:UserInstallation=file://{LIBREOFFICE_PROFILE}"
    script_url = (
        f"vnd.sun.star.script:Standard.Module1.{macro_name}"
        "?language=Basic&location=application"
    )
    document_arg = str(input_path.absolute())
    return subprocess.run(
        [
            LIBREOFFICE_BIN,
            "--headless",
            profile_arg,
            "--norestore",
            script_url,
            document_arg,
        ],
        capture_output=True,
        text=True,
        timeout=120,
    )
292
293
async def _apply_tracked_change_op(file: UploadFile, macro_name: str) -> Response:
    """Shared body for /tracked-changes/accept and /tracked-changes/reject.

    Writes the upload to a temporary file, runs the named macro on it via
    LibreOffice and returns the file's contents afterwards.

    Args:
        file: uploaded document; only ``.docx`` is accepted.
        macro_name: sub to run, "AcceptAllTrackedChanges" or
            "RejectAllTrackedChanges".

    Returns:
        A DOCX response named ``<stem>_<macro_name lower-cased>.docx``.

    Raises:
        HTTPException 400: missing filename or non-.docx upload.
        HTTPException 500: macro installation failed, soffice exited
            non-zero, or the file is missing afterwards.
        HTTPException 504: soffice exceeded its timeout.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="No filename provided")
    ext = os.path.splitext(file.filename)[1].lower()
    if ext != ".docx":
        raise HTTPException(
            status_code=400,
            detail=f"Only .docx supported for tracked-changes ops, got {ext}",
        )

    # Read the upload BEFORE installing the macro. /compare rewrites the same
    # Module1.xba, so an await between "install" and "run" would let a
    # concurrent compare request swap the module out from under this one.
    # From here to the end of the macro run there is no await.
    content = await file.read()
    logger.info(
        f"tracked-changes/{macro_name}: {file.filename} ({len(content)} bytes)"
    )

    if not _install_macro():
        raise HTTPException(status_code=500, detail="Failed to install LibreOffice macro")

    with tempfile.TemporaryDirectory() as tmpdir:
        input_path = Path(tmpdir) / f"input{ext}"
        input_path.write_bytes(content)

        try:
            result = _run_macro(input_path, macro_name)
        except subprocess.TimeoutExpired:
            raise HTTPException(status_code=504, detail="Macro execution timed out")

        if result.returncode != 0:
            logger.error(
                f"Macro {macro_name} failed: stdout={result.stdout[:300]} "
                f"stderr={result.stderr[:300]}"
            )
            raise HTTPException(
                status_code=500,
                detail=f"Macro failed: {result.stderr[:500]}",
            )

        # LibreOffice writes the macro output back to the same file after
        # ThisComponent.store() + ThisComponent.close().
        if not input_path.exists():
            raise HTTPException(
                status_code=500,
                detail="Output file missing after macro — macro likely closed without storing",
            )

        out_bytes = input_path.read_bytes()
        if out_bytes == content:
            # An unresolved macro URL leaves the file untouched while soffice
            # still exits 0. Presumably a real store() rewrites the package,
            # so identical bytes suggest the macro never ran — TODO confirm.
            logger.warning(
                f"tracked-changes/{macro_name}: output is byte-identical to the "
                f"upload; the macro may not have run"
            )
        out_name = f"{Path(file.filename).stem}_{macro_name.lower()}{ext}"
        logger.info(
            f"tracked-changes/{macro_name} ok: {file.filename} -> {len(out_bytes)} bytes"
        )

        # A bare quoted filename is only valid for printable ASCII without
        # '"' or '\\'; everything else gets an ASCII fallback plus the
        # RFC 5987 filename* parameter.
        header_safe = (
            out_name.isascii()
            and out_name.isprintable()
            and '"' not in out_name
            and "\\" not in out_name
        )
        if header_safe:
            disposition = f'inline; filename="{out_name}"'
        else:
            encoded = quote(out_name)
            disposition = (
                f'inline; filename="document{ext}"; filename*=UTF-8\'\'{encoded}'
            )

        return Response(
            content=out_bytes,
            media_type=OUTPUT_FORMATS["docx"]["mime"],
            headers={"Content-Disposition": disposition},
        )
360
361
@app.post("/tracked-changes/accept")
async def accept_all_tracked_changes(file: UploadFile = File(...)):
    """Accept every tracked change in an uploaded DOCX and return the result.

    Delegates to the shared tracked-changes helper with the
    ``AcceptAllTrackedChanges`` macro, which performs LibreOffice's native
    ``.uno:AcceptAllTrackedChanges`` dispatch.
    """
    accepted = await _apply_tracked_change_op(file, "AcceptAllTrackedChanges")
    return accepted
371
372
@app.post("/tracked-changes/reject")
async def reject_all_tracked_changes(file: UploadFile = File(...)):
    """Reject every tracked change in an uploaded DOCX and return the result.

    Delegates to the shared tracked-changes helper with the
    ``RejectAllTrackedChanges`` macro, so the returned document reflects the
    state before any tracked edits.
    """
    rejected = await _apply_tracked_change_op(file, "RejectAllTrackedChanges")
    return rejected
378
379
380# ---------------------------------------------------------------------------
381# Document compare via LibreOffice UNO macro dispatch.
382#
383# Generates a redlined DOCX showing the differences between two documents
384# (file1 = before / baseline, file2 = after / revised). LibreOffice's
385# .uno:CompareDocuments dispatcher loads file1 and merges file2's changes
386# in as tracked changes, which the user can then open in Word and accept/
387# reject like any other set of redlines.
388# ---------------------------------------------------------------------------
389
390
# Basic module template for the compare operation. The placeholder
# __SECOND_FILE_URL__ is substituted with the second document's file URL
# before the module is written to disk. The module is also named "Module1",
# i.e. it occupies the same Module1.xba file as the accept/reject macro.
# The sub dispatches .uno:CompareDocuments on the loaded document with that
# URL, stores the document back to its own file and closes it.
COMPARE_MACRO_TEMPLATE = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE script:module PUBLIC "-//OpenOffice.org//DTD OfficeDocument 1.0//EN" "module.dtd">
<script:module xmlns:script="http://openoffice.org/2000/script" script:name="Module1" script:language="StarBasic">
 Sub CompareWithSecondFile
 Dim oDoc As Object
 Dim oDispatcher As Object
 Dim args(0) As New com.sun.star.beans.PropertyValue
 args(0).Name = "URL"
 args(0).Value = "__SECOND_FILE_URL__"
 oDoc = ThisComponent.CurrentController.Frame
 oDispatcher = createUnoService("com.sun.star.frame.DispatchHelper")
 oDispatcher.executeDispatch(oDoc, ".uno:CompareDocuments", "", 0, args())
 ThisComponent.store()
 ThisComponent.close(True)
 End Sub
</script:module>"""
407
408
def _install_compare_macro(second_file_path: Path) -> bool:
    """Install the compare macro with the second file's URL baked in.

    The macro runs against file1 (passed as ThisComponent on the soffice
    command line) and dispatches CompareDocuments with file2's URL as
    the comparison source.

    Args:
        second_file_path: path of the document whose URL is substituted
            into COMPARE_MACRO_TEMPLATE.

    Returns:
        True when Module1.xba was written, False on any failure (the error
        is logged).
    """
    global _MACRO_INSTALLED
    try:
        # Ensure the profile + Standard library are scaffolded.
        macro_dir = Path(MACRO_DIR)
        if not macro_dir.exists():
            subprocess.run(
                [
                    LIBREOFFICE_BIN,
                    "--headless",
                    f"-env:UserInstallation=file://{LIBREOFFICE_PROFILE}",
                    "--terminate_after_init",
                ],
                capture_output=True,
                timeout=30,
                check=False,
            )
            macro_dir.mkdir(parents=True, exist_ok=True)

        # as_uri() yields the same "file:///..." form as plain concatenation
        # for simple paths, but percent-encodes characters such as spaces,
        # quotes, '&' and '<'. Unencoded, those would corrupt the XML module
        # or the Basic string literal the URL is spliced into.
        second_url = second_file_path.absolute().as_uri()
        macro_xml = COMPARE_MACRO_TEMPLATE.replace("__SECOND_FILE_URL__", second_url)
        # The module declares encoding="UTF-8" in its XML prolog.
        (macro_dir / "Module1.xba").write_text(macro_xml, encoding="utf-8")
        # Force reinstall on next accept/reject call so the macro reverts.
        _MACRO_INSTALLED = False
        logger.info(f"Installed compare macro pointing at {second_url}")
        return True
    except Exception as e:
        logger.error(f"Failed to install compare macro: {e}")
        return False
444
445
@app.post("/compare")
async def compare_documents(
    file1: UploadFile = File(..., description="Baseline document (before)"),
    file2: UploadFile = File(..., description="Revised document (after)"),
):
    """Compare two DOCX files. Returns a redlined DOCX showing file2's
    differences from file1 as tracked changes.

    Uses LibreOffice's native ``.uno:CompareDocuments`` dispatcher — which
    handles paragraph-mark migration, run-property tracking, and table-cell
    edge cases that hand-rolled diff approaches miss.

    file1 is loaded as the document the macro runs on and is stored in
    place; file2 is passed to the dispatcher by URL.

    Raises:
        HTTPException 400: a filename is missing or an upload is not .docx.
        HTTPException 500: macro installation failed, soffice exited
            non-zero, or the output file is missing afterwards.
        HTTPException 504: soffice exceeded the 120-second timeout.
    """
    # NOTE(review): confirm against LibreOffice's documentation of
    # "Compare Document" that loading the baseline and passing the revised
    # file as the URL yields redlines in the intended direction.
    if not file1.filename or not file2.filename:
        raise HTTPException(status_code=400, detail="Both file1 and file2 require filenames")
    for f in (file1, file2):
        ext = os.path.splitext(f.filename)[1].lower()
        if ext != ".docx":
            raise HTTPException(
                status_code=400,
                detail=f"Only .docx supported, got {ext} for {f.filename}",
            )

    # Both uploads are read up front so that no await separates installing
    # the compare macro from running it.
    content1 = await file1.read()
    content2 = await file2.read()
    logger.info(
        f"compare: file1={file1.filename} ({len(content1)} bytes), "
        f"file2={file2.filename} ({len(content2)} bytes)"
    )

    with tempfile.TemporaryDirectory() as tmpdir:
        path1 = Path(tmpdir) / "before.docx"
        path2 = Path(tmpdir) / "after.docx"
        path1.write_bytes(content1)
        path2.write_bytes(content2)

        if not _install_compare_macro(path2):
            raise HTTPException(status_code=500, detail="Failed to install compare macro")

        cmd = [
            LIBREOFFICE_BIN,
            "--headless",
            f"-env:UserInstallation=file://{LIBREOFFICE_PROFILE}",
            "--norestore",
            "vnd.sun.star.script:Standard.Module1.CompareWithSecondFile?language=Basic&location=application",
            str(path1.absolute()),
        ]
        try:
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
        except subprocess.TimeoutExpired:
            raise HTTPException(status_code=504, detail="Compare timed out")

        if result.returncode != 0:
            logger.error(
                f"Compare macro failed: stdout={result.stdout[:300]} "
                f"stderr={result.stderr[:300]}"
            )
            raise HTTPException(
                status_code=500,
                detail=f"Compare failed: {result.stderr[:500]}",
            )

        if not path1.exists():
            raise HTTPException(
                status_code=500,
                detail="Output missing after compare macro",
            )

        out_bytes = path1.read_bytes()
        if out_bytes == content1:
            # The macro stores into before.docx; identical bytes suggest it
            # never ran (presumably store() rewrites the package — TODO
            # confirm), so surface that in the log.
            logger.warning(
                "compare: output is byte-identical to file1; the compare macro may not have run"
            )
        out_name = f"{Path(file1.filename).stem}_vs_{Path(file2.filename).stem}_redlined.docx"
        logger.info(f"compare ok: {out_name} ({len(out_bytes)} bytes)")

        # A bare quoted filename is only valid for printable ASCII without
        # '"' or '\\'; everything else gets an ASCII fallback plus the
        # RFC 5987 filename* parameter.
        header_safe = (
            out_name.isascii()
            and out_name.isprintable()
            and '"' not in out_name
            and "\\" not in out_name
        )
        if header_safe:
            disposition = f'inline; filename="{out_name}"'
        else:
            encoded = quote(out_name)
            disposition = (
                f'inline; filename="redlined.docx"; filename*=UTF-8\'\'{encoded}'
            )

        return Response(
            content=out_bytes,
            media_type=OUTPUT_FORMATS["docx"]["mime"],
            headers={"Content-Disposition": disposition},
        )
531
532
@app.get("/")
async def root():
    """Return a static descriptor of the service and its endpoints."""
    endpoint_summary = {
        "/convert": "POST - Convert document to PDF/DOCX",
        "/tracked-changes/accept": "POST - Accept all tracked changes, return cleaned DOCX",
        "/tracked-changes/reject": "POST - Reject all tracked changes, return original-state DOCX",
        "/compare": "POST - Compare two DOCX files (file1=before, file2=after), return redlined DOCX",
        "/health": "GET - Health check",
    }
    descriptor = {
        "service": "LibreOffice Document Preview Service",
        "version": "1.2.0",
        "endpoints": endpoint_summary,
    }
    return descriptor
546
547
@app.exception_handler(Exception)
async def global_exception_handler(request, exc):
    """Catch-all handler: log the exception and answer with a JSON 500.

    Args:
        request: the incoming request (unused).
        exc: the exception that reached the handler.

    Returns:
        A JSONResponse with status 500 carrying "error" and "detail" keys.
    """
    # exc_info attaches the traceback; the message alone rarely locates
    # the failure.
    logger.error(f"Unhandled exception: {exc}", exc_info=exc)
    # NOTE(review): "detail" exposes str(exc) to the client — confirm that
    # this is acceptable for the deployments this service runs in.
    return JSONResponse(
        status_code=500,
        content={"error": "Internal server error", "detail": str(exc)},
    )
555
556
if __name__ == "__main__":
    # Direct-execution entry point: serve the app with uvicorn.
    import uvicorn

    # PORT and HOST come from the environment, defaulting to 8002 and
    # 0.0.0.0; auto-reload is on only when ENV is "development" (the default).
    listen_port = int(os.getenv("PORT", "8002"))
    listen_host = os.getenv("HOST", "0.0.0.0")
    dev_mode = os.getenv("ENV", "development") == "development"
    uvicorn.run("main:app", host=listen_host, port=listen_port, reload=dev_mode)
568