The Atlas AnyLegal OSS — documentation bound to its code
20 documents

The DOCX tracked-change pipeline

How a plain-text edit from the model becomes surgical Word tracked-change markup that preserves formatting — from the skill the model reads to the OOXML engine and LibreOffice finalization.

backend/anylegal_oss/workspace/docx_xml_service.py — 2455 lines · apply_text_edit L535–652
Outline 75 symbols
1"""
2DOCX XML Service — In-memory unpack / merge-runs / validate / repack / edit.
3
4Adapts the Anthropic DOCX Skill workflow (unpack → edit XML → pack) for
5server-side agentic editing. Instead of writing to disk, every operation
6works on bytes/strings in memory.
7
8Public API
9----------
10extract_document_xml(blob) → str
11 Extract word/document.xml, pretty-print, merge runs, strip RSIDs.
12 Returns the cleaned XML string for LLM consumption.
13
14repack_docx(original_blob, new_document_xml) → bytes
15 Replace word/document.xml inside the DOCX ZIP and return new blob.
16
17validate_document_xml(xml_str) → list[str]
18 Quick structural checks (well-formed, w:del uses w:delText, etc.).
19
20apply_text_edit(xml_str, old_text, new_text, author) → (str, dict) | (None, dict)
21 Find plain text in <w:t> elements and apply tracked-change markup.
22 The LLM sends plain text; this function generates OOXML w:del/w:ins.
23"""
24
25import io
26import logging
27import re
28import zipfile
29from datetime import datetime, timezone
30from defusedxml.minidom import parseString as safe_parseString
31from xml.dom import minidom
32
33logger = logging.getLogger(__name__)
34
# Curly quotation marks mapped to their upper-case hexadecimal character
# references (e.g. U+201C -> "&#x201C;"); extract_document_xml substitutes
# these into the pretty-printed XML.
SMART_QUOTE_REPLACEMENTS = {
    quote: f"&#x{ord(quote):04X};"
    for quote in ("\u201c", "\u201d", "\u2018", "\u2019")
}
41
def extract_document_xml(blob: bytes) -> str:
    """
    Return a cleaned, pretty-printed ``word/document.xml`` for LLM editing.

    Pipeline, in order:
      1. Read ``word/document.xml`` out of the DOCX ZIP and parse it.
      2. Drop ``proofErr`` elements and strip run rsid attributes.
      3. Merge runs inside every element that directly contains a ``<w:r>``.
      4. Pretty-print, then replace each curly quote with the entity listed
         in ``SMART_QUOTE_REPLACEMENTS``.
      5. Collapse blank lines left behind by pretty-printing.

    Returns the resulting XML as a ``str``.
    """
    dom = safe_parseString(_read_zip_entry(blob, "word/document.xml"))
    root = dom.documentElement

    _remove_elements(root, "proofErr")
    _strip_run_rsid_attrs(root)

    # Each distinct parent of a run is merged exactly once.
    run_parents = {run.parentNode for run in _find_elements(root, "r")}
    merge_count = 0
    for parent in run_parents:
        merge_count += _merge_runs_in(parent)

    pretty = dom.toprettyxml(indent=" ", encoding="utf-8").decode("utf-8")
    for quote_char, entity in SMART_QUOTE_REPLACEMENTS.items():
        pretty = pretty.replace(quote_char, entity)

    # Pretty-printing leaves whitespace-only lines; squeeze them out.
    pretty = re.sub(r'\n\s*\n', '\n', pretty)

    logger.info(f"[DOCX-XML] Extracted document.xml: merged {merge_count} runs")
    return pretty
77
def repack_docx(original_blob: bytes, new_document_xml: str) -> bytes:
    """
    Build a new DOCX blob whose ``word/document.xml`` is *new_document_xml*.

    The XML is passed through ``_condense_xml`` first. Every other ZIP entry
    is copied from *original_blob* using its original ``ZipInfo`` record.

    Returns the bytes of the rebuilt archive.
    """
    condensed = _condense_xml(new_document_xml)
    packed = io.BytesIO()

    with zipfile.ZipFile(io.BytesIO(original_blob), "r") as src, \
            zipfile.ZipFile(packed, "w", zipfile.ZIP_DEFLATED) as dst:
        for entry in src.infolist():
            if entry.filename != "word/document.xml":
                # Untouched part: copy the stored payload across as-is.
                dst.writestr(entry, src.read(entry.filename))
                continue
            dst.writestr(entry, condensed.encode("utf-8"))

    return packed.getvalue()
99
def validate_document_xml(xml_str: str) -> list:
    """
    Run quick structural checks on an edited document.xml string.

    Checks performed:
      1. The XML parses (a failure is returned as the single error).
      2. No ``<w:t>`` sits inside ``<w:del>`` unless an ``ins`` element lies
         between the two.
      3. No ``<w:delText>`` sits inside ``<w:ins>`` unless a ``del`` element
         lies between the two.
      4. Every ``<w:t>`` whose text starts or ends with a space or tab has
         ``xml:space="preserve"``.

    Returns a list of error strings; an empty list means no problem found.
    """
    try:
        dom = safe_parseString(xml_str.encode("utf-8"))
    except Exception as e:
        return [f"XML parse error: {e}"]

    def _wrapped_by(node, boundary, wrapper: str) -> bool:
        """True if a *wrapper* element lies strictly between node and boundary."""
        ancestor = node.parentNode
        while ancestor and ancestor != boundary:
            tag = ancestor.localName or ancestor.tagName
            if tag == wrapper or tag.endswith(f":{wrapper}"):
                return True
            ancestor = ancestor.parentNode
        return False

    root = dom.documentElement
    problems = []

    for deletion in _find_elements(root, "del"):
        for text_node in _find_elements(deletion, "t"):
            if _wrapped_by(text_node, deletion, "ins"):
                continue
            text = _get_text_content(text_node)[:50]
            problems.append(
                f"<w:t> inside <w:del> (should be <w:delText>): '{text}'"
            )

    for insertion in _find_elements(root, "ins"):
        for del_text_node in _find_elements(insertion, "delText"):
            if _wrapped_by(del_text_node, insertion, "del"):
                continue
            text = _get_text_content(del_text_node)[:50]
            problems.append(
                f"<w:delText> inside <w:ins> without <w:del>: '{text}'"
            )

    edge_whitespace = (' ', '\t')
    for text_node in _find_elements(root, "t"):
        text = _get_text_content(text_node)
        if not text:
            continue
        if not (text.startswith(edge_whitespace) or text.endswith(edge_whitespace)):
            continue
        if text_node.getAttribute("xml:space") != "preserve":
            problems.append(
                f"<w:t> with whitespace missing xml:space='preserve': "
                f"'{text[:30]}'"
            )

    return problems
164
def extract_plain_text(blob: bytes) -> str:
    """
    Render the document body of a DOCX blob as plain text.

    Top-level paragraphs become text blocks and top-level tables become
    ``| cell | cell |`` rows; blocks are joined with blank lines. Runs for
    which ``_is_inside_del_dom`` is true are left out, and ``<w:br>``
    becomes a newline.
    """
    root = safe_parseString(_read_zip_entry(blob, "word/document.xml")).documentElement

    def _tag_is(node, name: str) -> bool:
        """Match *name* against the element's local or prefixed tag."""
        tag = node.localName or node.tagName or ""
        return tag == name or tag.endswith(f":{name}")

    def _element_children(node) -> list:
        """Direct children of *node* that are elements."""
        return [c for c in node.childNodes if c.nodeType == c.ELEMENT_NODE]

    def _para_text(p_elem) -> str:
        """Extract text from a single <w:p>, skipping deleted content."""
        pieces: list = []
        for run in _find_elements(p_elem, "r"):
            if _is_inside_del_dom(run):
                continue
            for child in _element_children(run):
                if _tag_is(child, "t"):
                    content = _get_text_content(child)
                    if content:
                        pieces.append(content)
                elif _tag_is(child, "br"):
                    pieces.append("\n")
        return "".join(pieces)

    def _table_text(tbl_elem) -> str:
        """Render a <w:tbl> as pipe-delimited rows."""
        rendered: list = []
        for row in _find_elements(tbl_elem, "tr"):
            cells: list = []
            for cell in _find_elements(row, "tc"):
                para_texts = [_para_text(p) for p in _find_elements(cell, "p")]
                joined = " ".join(t for t in para_texts if t.strip())
                cells.append(joined.strip())
            # Rows whose cells are all empty are omitted.
            if any(cells):
                rendered.append("| " + " | ".join(cells) + " |")
        return "\n".join(rendered)

    # Use <w:body> when present, otherwise fall back to the root element.
    body = next(
        (child for child in _element_children(root) if _tag_is(child, "body")),
        root,
    )

    blocks: list = []
    for child in _element_children(body):
        if _tag_is(child, "p"):
            rendered_block = _para_text(child)
        elif _tag_is(child, "tbl"):
            rendered_block = _table_text(child)
        else:
            continue
        if rendered_block.strip():
            blocks.append(rendered_block)

    return "\n\n".join(blocks)
237
def _revision_patterns(rid: int) -> tuple:
    """Compile the regexes that locate tracked-change revision *rid*.

    Returns ``(marker_re, ins_re, del_re)``:

    - ``marker_re`` matches a *self-closing* ``<w:ins .../>`` or
      ``<w:del .../>`` carrying the id (the paragraph-mark markers that live
      inside ``<w:pPr><w:rPr>``).
    - ``ins_re`` / ``del_re`` match the paired element carrying the id;
      group 1 is the markup between the opening and closing tags.

    The ``(?<!/)`` lookbehind keeps the paired patterns from mistaking a
    self-closing marker for an opening tag, which would otherwise make the
    match run on to the next unrelated ``</w:ins>`` / ``</w:del>``.
    """
    attrs = rf'[^>]*w:id="{rid}"[^>]*'
    marker_re = re.compile(rf'<w:(?:ins|del)\b{attrs}/>')
    ins_re = re.compile(rf'<w:ins\b{attrs}(?<!/)>(.*?)</w:ins>', re.DOTALL)
    del_re = re.compile(rf'<w:del\b{attrs}(?<!/)>(.*?)</w:del>', re.DOTALL)
    return marker_re, ins_re, del_re


def revert_tracked_changes(
    xml_str: str,
    revision_ids: list,
) -> tuple:
    """
    Surgically remove specific tracked changes from DOCX XML by revision ID.

    For each revision ID (processed in ascending order):
    - ``<w:del w:id="ID">…</w:del>`` is **unwrapped**: ``<w:delText>`` becomes
      ``<w:t>`` and the wrapper is dropped, so the runs read as normal text.
    - ``<w:ins w:id="ID">…</w:ins>`` is **removed entirely**.
    - a self-closing ``<w:ins/>`` / ``<w:del/>`` paragraph-mark marker with
      that ID is stripped, leaving an ordinary paragraph mark.

    Every element carrying the ID is handled, not just the first one.
    Afterwards ``_remove_empty_paragraphs`` drops paragraphs left without
    runs.

    Parameters
    ----------
    xml_str : str
        Current document.xml content (may contain multiple edits).
    revision_ids : list of int
        Revision IDs to revert; each is passed through ``int()``.

    Returns
    -------
    (new_xml, info_dict)
        info_dict contains ``reverted_ids`` and ``not_found_ids``; with an
        empty *revision_ids* the XML is returned unchanged together with
        ``{"error": ...}``.
    """
    if not revision_ids:
        return xml_str, {"error": "No revision IDs provided"}

    def _restore_deleted(match) -> str:
        """Turn the inside of a <w:del> back into ordinary run text."""
        inner = re.sub(r'<w:delText\b', '<w:t', match.group(1))
        return inner.replace('</w:delText>', '</w:t>')

    reverted = []
    not_found = []
    result = xml_str

    for rid in sorted({int(rid) for rid in revision_ids}):
        marker_re, ins_re, del_re = _revision_patterns(rid)
        result, n_markers = marker_re.subn('', result)
        result, n_ins = ins_re.subn('', result)
        # A callable replacement avoids backslash/group-reference expansion.
        result, n_del = del_re.subn(_restore_deleted, result)

        if n_markers or n_ins or n_del:
            reverted.append(rid)
        else:
            not_found.append(rid)

    result = _remove_empty_paragraphs(result)

    return result, {"reverted_ids": reverted, "not_found_ids": not_found}


def accept_specific_changes(
    xml_str: str,
    revision_ids: list,
) -> tuple:
    """
    Accept specific tracked changes by revision ID.

    Mirror image of ``revert_tracked_changes``. For each revision ID:
    - ``<w:ins w:id="ID">…</w:ins>`` is **unwrapped** (the inserted runs
      become permanent text).
    - ``<w:del w:id="ID">…</w:del>`` is **removed entirely**.
    - a self-closing ``<w:ins/>`` / ``<w:del/>`` paragraph-mark marker with
      that ID is stripped.

    Afterwards ``_remove_empty_paragraphs`` drops paragraphs left without
    runs (this is what makes an accepted, fully deleted paragraph vanish).

    Returns
    -------
    (new_xml, info_dict)
        info_dict contains ``accepted_ids`` and ``not_found_ids``; with an
        empty *revision_ids* the XML is returned unchanged together with
        ``{"error": ...}``.
    """
    if not revision_ids:
        return xml_str, {"error": "No revision IDs provided"}

    accepted = []
    not_found = []
    result = xml_str

    for rid in sorted({int(rid) for rid in revision_ids}):
        marker_re, ins_re, del_re = _revision_patterns(rid)
        result, n_markers = marker_re.subn('', result)
        result, n_ins = ins_re.subn(lambda m: m.group(1), result)
        result, n_del = del_re.subn('', result)

        if n_markers or n_ins or n_del:
            accepted.append(rid)
        else:
            not_found.append(rid)

    result = _remove_empty_paragraphs(result)

    return result, {"accepted_ids": accepted, "not_found_ids": not_found}


def _remove_empty_paragraphs(xml_str: str) -> str:
    """Remove ``<w:p>…</w:p>`` elements that contain no ``<w:r>`` run.

    Used after revert/accept, where removing an ``<w:ins>``/``<w:del>`` can
    leave a paragraph holding only ``<w:pPr>``.

    A run-less paragraph is nevertheless kept when:
    - it carries ``<w:sectPr>`` (it holds the section properties), or
    - it is immediately followed by ``</w:tc>`` (a table cell has to end
      with a paragraph, so the last one must stay).

    Self-closing ``<w:p/>`` elements are never touched.

    NOTE(review): the scan covers the whole document, so blank paragraphs
    that were already present before the revert/accept are removed as well.
    """
    para_re = re.compile(r'<w:p\b[^>]*(?<!/)>.*?</w:p>', re.DOTALL)
    cell_close_re = re.compile(r'\s*</w:tc>')

    def _drop_if_empty(match) -> str:
        para = match.group(0)
        if re.search(r'<w:r\b', para) or '<w:sectPr' in para:
            return para
        if cell_close_re.match(match.string, match.end()):
            return para
        return ''

    return para_re.sub(_drop_if_empty, xml_str)
390
# Entity -> character table for turning escaped XML text back into plain text.
# "&amp;" is deliberately the LAST entry: if a consumer walks this dict in
# order doing successive str.replace calls, decoding "&amp;" first would turn
# "&amp;lt;" into "&lt;" and then into "<" (a double decode).
# NOTE(review): the consumer is not visible here; presumably _decode_xml_text
# iterates this table in order. Confirm.
_XML_ENTITY_DECODE = {
    "&#x201C;": "\u201c",
    "&#x201D;": "\u201d",
    "&#x2018;": "\u2018",
    "&#x2019;": "\u2019",
    "&lt;": "<",
    "&gt;": ">",
    "&quot;": '"',
    "&apos;": "'",
    "&amp;": "&",
}
402
def _delete_empty_paragraph_marks(
    xml_str: str,
    revision_ids: list,
    author: str = "Anylegal.ai",
) -> tuple:
    """Mark the paragraph mark as deleted for paragraphs emptied by an edit.

    When all text of a paragraph has been wrapped in ``<w:del>`` with no
    ``<w:ins>``, a ``<w:del .../>`` marker is added to the paragraph's
    ``<w:pPr><w:rPr>`` so the paragraph mark itself is tracked as deleted.

    A paragraph is processed only if it:
      1. contains ``w:id="…"`` for at least one of *revision_ids*,
      2. contains at least one ``<w:del``,
      3. contains no ``<w:ins``,
      4. has no ``<w:t>`` for which ``_is_inside_del_str`` is false, and
      5. does not already have ``<w:del `` inside its ``<w:pPr>``.

    Placement of the marker:
      - inside an existing ``<w:rPr>`` it goes first, ahead of the
        formatting properties;
      - a newly created ``<w:rPr>`` goes before ``<w:sectPr>`` /
        ``<w:pPrChange>`` when either is present, otherwise at the end of
        ``<w:pPr>``;
      - self-closing ``<w:rPr/>`` / ``<w:pPr/>`` are expanded in place;
      - with no ``<w:pPr>`` at all, one is prepended to the paragraph body.

    New markers receive ids counting up from ``max(revision_ids) + 1``.

    Returns ``(modified_xml, extra_revision_ids)``; the input is returned
    unchanged with ``[]`` when nothing qualified.
    """
    if not revision_ids:
        return xml_str, []

    date_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    id_needles = {f'w:id="{rid}"' for rid in revision_ids}
    next_id = max(revision_ids) + 1
    extra_ids = []

    # "(?<!/)>" keeps self-closing elements from being read as opening tags.
    p_re = re.compile(r"(<w:p\b[^>]*(?<!/)>)(.*?)(</w:p>)", re.DOTALL)
    t_re = re.compile(r"<w:t\b[^>]*>[^<]*</w:t>", re.DOTALL)
    ins_re = re.compile(r"<w:ins\b")
    del_re = re.compile(r"<w:del\b")
    ppr_re = re.compile(r"<w:pPr\b[^>]*(?<!/)>(.*?)</w:pPr>", re.DOTALL)
    ppr_empty_re = re.compile(r"<w:pPr\b[^>]*/>")
    rpr_open_re = re.compile(r"<w:rPr\b[^>]*(?<!/)>")
    rpr_empty_re = re.compile(r"<w:rPr\b[^>]*/>")
    ppr_tail_re = re.compile(r"<w:(?:sectPr|pPrChange)\b")

    result_parts = []
    last_end = 0

    for m in p_re.finditer(xml_str):
        p_open, p_body, p_close = m.groups()

        if not any(needle in p_body for needle in id_needles):
            continue
        if not del_re.search(p_body) or ins_re.search(p_body):
            continue
        # Any <w:t> outside a deletion means the paragraph still shows text.
        if any(
            not _is_inside_del_str(p_body, t_m.start())
            for t_m in t_re.finditer(p_body)
        ):
            continue

        ppr_m = ppr_re.search(p_body)
        if ppr_m and "<w:del " in ppr_m.group(1):
            continue  # paragraph mark already tracked as deleted

        marker = (
            f'<w:del w:id="{next_id}" w:author="{author}" w:date="{date_str}"/>'
        )
        wrapped = f"<w:rPr>{marker}</w:rPr>"

        if ppr_m:
            inner = ppr_m.group(1)
            rpr_open = rpr_open_re.search(inner)
            rpr_empty = rpr_empty_re.search(inner)
            if rpr_open:
                cut = rpr_open.end()
                new_inner = inner[:cut] + marker + inner[cut:]
            elif rpr_empty:
                new_inner = (
                    inner[: rpr_empty.start()] + wrapped + inner[rpr_empty.end():]
                )
            else:
                tail = ppr_tail_re.search(inner)
                cut = tail.start() if tail else len(inner)
                new_inner = inner[:cut] + wrapped + inner[cut:]
            # Splice by offset: str.replace on the inner text misplaces the
            # insert when <w:pPr> is empty or its text repeats elsewhere.
            new_body = (
                p_body[: ppr_m.start(1)] + new_inner + p_body[ppr_m.end(1):]
            )
        else:
            ppr_empty = ppr_empty_re.search(p_body)
            if ppr_empty:
                new_body = (
                    p_body[: ppr_empty.start()]
                    + f"<w:pPr>{wrapped}</w:pPr>"
                    + p_body[ppr_empty.end():]
                )
            else:
                new_body = f"<w:pPr>{wrapped}</w:pPr>" + p_body

        result_parts.append(xml_str[last_end : m.start()])
        result_parts.append(p_open + new_body + p_close)
        last_end = m.end()
        extra_ids.append(next_id)
        next_id += 1

    if not extra_ids:
        return xml_str, []

    result_parts.append(xml_str[last_end:])
    return "".join(result_parts), extra_ids
511
def _finalize_edit(result_xml: str, info: dict, author: str = "Anylegal.ai"):
    """Post-process a successful edit and return ``(result_xml, info)``.

    1. Run ``_delete_empty_paragraph_marks`` with the edit's revision ids and
       append any new ids to ``info["revision_ids"]``.
    2. Best-effort: run ``simplify_redlines_xml`` on the result. When it
       reports merges, the count is stored in ``info["simplified_redlines"]``.
       Any exception (including a failed import) is logged at debug level and
       the step is skipped.
    """
    known_ids = info.get("revision_ids", [])
    result_xml, extra_ids = _delete_empty_paragraph_marks(result_xml, known_ids, author)
    if extra_ids:
        info["revision_ids"] = known_ids + extra_ids

    try:
        from .tools.validators.simplify_redlines import simplify_redlines_xml
        result_xml, merged = simplify_redlines_xml(result_xml)
    except Exception as e:
        logger.debug(f"[DOCX-XML] simplify_redlines skipped: {e}")
    else:
        if merged:
            info["simplified_redlines"] = merged
            logger.debug(f"[DOCX-XML] simplified {merged} adjacent tracked changes")

    return result_xml, info
534
def apply_text_edit(
    xml_str: str,
    old_text: str,
    new_text: str,
    author: str = "Anylegal.ai",
    near_text: str = "",
) -> tuple:
    """
    Locate plain *old_text* in the document XML and replace it with
    *new_text* as an OOXML tracked change.

    Strategies are tried in order and the first one that finds a match wins:
      1. Single run, in the modes ``exact``, ``quotes`` and ``icase``.
      2. Cross-run (text spanning several runs).
      3. Cross-paragraph.

    For strategies 1 and 2, ``_resolve_candidates`` picks one candidate using
    *near_text*; if it returns ``None`` the edit is refused with an ambiguity
    error. Strategy 3 does not consult *near_text*.

    When nothing matches, an error dict is returned whose ``suggestion`` is
    chosen by, in priority order: pipe characters that look like a rendered
    table row, a 12+ character matching prefix of *old_text*, a glyph hint
    from ``_diagnose_glyph_mismatch``, or a generic checklist.

    Returns
    -------
    (new_xml_str, info_dict) on success
    (None, error_dict) on failure
    """
    if not old_text:
        return None, {"error": "old_text is empty"}

    skip = _skip_ranges(xml_str)
    text_re = re.compile(r"<w:t([^>]*)>([^<]*)</w:t>", re.DOTALL)

    # Strategy 1: the whole match lives in one run.
    for mode in ("exact", "quotes", "icase"):
        hits = _scan_for_match(xml_str, text_re, skip, old_text, mode)
        if not hits:
            continue
        chosen = _resolve_candidates(xml_str, hits, near_text, "run_span")
        if chosen is None:
            return None, _ambiguity_error(old_text, len(hits))
        edited, info = _build_single_run_change(xml_str, chosen, new_text, author)
        info.setdefault("mode", f"single_run_{mode}")
        return _finalize_edit(edited, info, author)

    # Strategy 2: the match spans runs.
    run_hits = _scan_cross_run(xml_str, text_re, skip, old_text)
    if run_hits:
        chosen = _resolve_candidates(xml_str, run_hits, near_text, "p_start")
        if chosen is None:
            return None, _ambiguity_error(old_text, len(run_hits))
        edited, info = _build_cross_run_change(xml_str, chosen, new_text, author)
        info.setdefault("mode", "cross_run")
        return _finalize_edit(edited, info, author)

    # Strategy 3: the match spans paragraphs.
    para_hit = _scan_cross_paragraph(xml_str, text_re, skip, old_text)
    if para_hit is not None:
        edited, info = _build_cross_paragraph_change(
            xml_str, para_hit, new_text, author
        )
        info["mode"] = "cross_paragraph"
        return _finalize_edit(edited, info, author)

    # Nothing matched: gather diagnostics for the error payload.
    nearby = _nearby_paragraph_text(xml_str, old_text)

    pipe_row = old_text.count("|") >= 2 and (
        " | " in old_text or old_text.lstrip().startswith("|")
    )
    partial = None if pipe_row else _longest_matching_span(xml_str, old_text)
    glyph_hint = _diagnose_glyph_mismatch(old_text)

    if pipe_row:
        suggestion = (
            "Your old_text contains | pipe characters from the table display format. "
            "Table cells are separate elements — you CANNOT edit across cells in one call. "
            "Remove all | characters and edit ONE cell at a time. "
            "Example: instead of '| cell1 | cell2 |', use just 'cell1' as old_text."
        )
    elif partial and len(partial) >= 12:
        suggestion = (
            f"Your old_text is too long — the document matches a shorter span. "
            f"The longest matching segment is: '{partial[:120]}'. "
            "Retry with just that text as old_text, and if you need more "
            "context to disambiguate, pass near_text instead of extending "
            "old_text. Stretching old_text across paragraph boundaries or "
            "across a footnote / bookmark anchor is the #1 cause of "
            "'Text not found' errors."
        )
    elif glyph_hint:
        suggestion = (
            f"{glyph_hint} "
            "Retry with the exact character from the document. Use "
            "read_document(view='text') to see the source verbatim."
        )
    else:
        suggestion = (
            "The text you sent does not match the document. Checklist: "
            "(a) use read_document(view='text') and copy the EXACT span — "
            "don't retype from memory; "
            "(b) if the text spans punctuation like brackets, footnote "
            "anchors, or bookmarks, try a SHORTER span that stops before "
            "the anchor, then use near_text to disambiguate; "
            "(c) confirm special characters (● vs •, smart vs straight "
            "quotes, em vs en dash) match the source."
        )

    return None, {
        "error": f"Text not found in document: '{old_text[:120]}'",
        "suggestion": suggestion,
        "nearby_text": nearby,
        "longest_matching_prefix": partial,
    }
653
654def _longest_matching_span(xml_str: str, old_text: str) -> str | None:
655 """Find the longest prefix / suffix / substring of ``old_text`` that
656 *is* present in the document's `<w:t>` text. Used to diagnose
657 "model over-reached on old_text" failures — the returned span is a
658 valid `old_text` the model can retry with.
659
660 Caps at 30 candidates per direction to keep this cheap on long
661 paragraphs. Returns ``None`` if no 12+ char substring matches.
662 """
663 if len(old_text) < 12:
664 return None
665
666 t_re = re.compile(r"<w:t[^>]*>([^<]*)</w:t>", re.DOTALL)
667 tc_ranges = _skip_ranges(xml_str)
668 chunks = []
669 for m in t_re.finditer(xml_str):
670 if _in_tracked_change(m.start(), tc_ranges):
671 continue
672 decoded = _decode_xml_text(m.group(1))
673 if decoded:
674 chunks.append(decoded)
675 concat = "".join(chunks)
676 if not concat:
677 return None
678
679 max_try = min(len(old_text), 400)
680 best = None
681
682 for length in range(max_try, 11, -max(1, max_try // 30)):
683 candidate = old_text[:length].rstrip()
684 if len(candidate) < 12:
685 continue
686 if candidate in concat or candidate.lower() in concat.lower():
687 best = candidate
688 break
689 return best
690
691def _diagnose_glyph_mismatch(old_text: str) -> str | None:
692 """Return a hint string if ``old_text`` contains characters that
693 LLMs commonly substitute for the actual document character. Returns
694 ``None`` if no confusables are present.
695 """
696 hints = []
697 if "" in old_text or "" in old_text:
698 hints.append(
699 "Bullet glyphs vary — documents may use ● (U+25CF), • (U+2022), "
700 "or ◦ (U+25E6); these are not interchangeable in exact-text "
701 "matching."
702 )
703 if "" in old_text or "" in old_text or "" in old_text:
704 hints.append(
705 "Dashes vary — em dash (—), en dash (–), and hyphen-minus (-) "
706 "are different characters."
707 )
708 if "" in old_text or "" in old_text or "" in old_text or "" in old_text:
709 hints.append(
710 "Smart quotes — the document may use straight (\" ') or "
711 "curly (“ ” ‘ ’) quotes; exact match distinguishes them."
712 )
713 return " ".join(hints) if hints else None
714
def apply_range_delete(
    xml_str: str,
    start_text: str,
    end_text: str,
    author: str = "Anylegal.ai",
) -> tuple:
    """
    Delete all paragraphs from the one containing *start_text* through the
    one containing *end_text* (inclusive), as tracked deletions.

    The start marker is the FIRST non-blank paragraph containing
    *start_text*; the end marker is the LAST non-blank paragraph at or after
    it containing *end_text* (both via ``_text_contains``).

    Each paragraph in the range that still has text outside tracked changes
    is replaced by a new ``<w:p>`` holding its ``<w:pPr>`` and a single
    ``<w:del>`` run with the paragraph's stripped text. The run properties
    are taken from the run enclosing the first ``<w:t>`` that is not in a
    tracked change. Paragraphs with no such text are left as-is.

    Returns ``(new_xml_str, info_dict)`` on success (after ``_finalize_edit``),
    ``(None, error_dict)`` on failure.
    """
    if not start_text or not end_text:
        return None, {"error": "Both start_text and end_text are required."}

    tc_ranges = _skip_ranges(xml_str)
    p_re = re.compile(r"<w:p\b[^>]*>.*?</w:p>", re.DOTALL)
    t_re = re.compile(r"<w:t([^>]*)>([^<]*)</w:t>", re.DOTALL)
    # Same as t_re plus self-closing <w:br/>; group 2 is None for a break.
    t_br_re = re.compile(
        r"<w:t([^>]*)>([^<]*)</w:t>|<w:br\b[^/]*/>",
        re.DOTALL,
    )

    # Index every paragraph: its text outside tracked changes and its
    # absolute offsets in xml_str.
    paras = []
    for p_match in p_re.finditer(xml_str):
        p_xml = p_match.group(0)
        texts = []
        for m in t_br_re.finditer(p_xml):
            # tc_ranges are document offsets, so convert from paragraph-local.
            abs_pos = p_match.start() + m.start()
            if _in_tracked_change(abs_pos, tc_ranges):
                continue
            if m.group(2) is not None:
                decoded = _decode_xml_text(m.group(2))
                if decoded:
                    texts.append(decoded)
            else:
                texts.append("\n")
        paras.append({
            "text": "".join(texts),
            "p_start": p_match.start(),
            "p_end": p_match.end(),
            "p_xml": p_xml,
        })

    # Start marker: first non-blank paragraph containing start_text.
    start_idx = None
    for i, p in enumerate(paras):
        if not p["text"].strip():
            continue
        if _text_contains(p["text"], start_text):
            start_idx = i
            break

    if start_idx is None:
        return None, {
            "error": f"Start marker not found: '{start_text[:120]}'",
            "suggestion": "Copy exact text from the document.",
        }

    # End marker: scan backwards from the last paragraph down to start_idx,
    # so the widest possible range is chosen.
    end_idx = None
    for i in range(len(paras) - 1, start_idx - 1, -1):
        if not paras[i]["text"].strip():
            continue
        if _text_contains(paras[i]["text"], end_text):
            end_idx = i
            break

    if end_idx is None:
        return None, {
            "error": f"End marker not found: '{end_text[:120]}'",
            "suggestion": "Copy exact text from the document.",
        }

    # Defensive only: the backwards scan never yields an index < start_idx.
    if end_idx < start_idx:
        return None, {
            "error": "End marker appears before start marker.",
        }

    next_id = _next_revision_id(xml_str)
    date_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    id_counter = next_id
    revision_ids = []

    # Collect (start, end, new_xml) per paragraph; splicing happens later.
    replacements = []
    for i in range(start_idx, end_idx + 1):
        p = paras[i]
        para_text = p["text"].strip()
        if not para_text:
            continue

        p_xml = p["p_xml"]
        # Skip paragraphs whose <w:t> text outside tracked changes is blank.
        non_del_text = []
        for m in t_re.finditer(p_xml):
            abs_pos = p["p_start"] + m.start()
            if not _in_tracked_change(abs_pos, tc_ranges):
                non_del_text.append(_decode_xml_text(m.group(2)))
        if not "".join(non_del_text).strip():
            continue

        ppr = _extract_ppr_str(p_xml)
        sp = _space_attr(para_text)
        rpr = ""

        # Run properties come from the run around the first <w:t> that is
        # not in a tracked change; on ValueError rpr stays empty.
        for m in t_re.finditer(p_xml):
            abs_pos = p["p_start"] + m.start()
            if _in_tracked_change(abs_pos, tc_ranges):
                continue
            try:
                rs, re_ = _find_enclosing_run(xml_str, abs_pos)
                rpr = _extract_rpr_str(xml_str[rs:re_])
            except ValueError:
                pass
            break

        # The whole paragraph is rebuilt as one deleted run.
        del_para = (
            f"<w:p>{ppr}"
            f'<w:del w:id="{id_counter}" w:author="{author}" w:date="{date_str}">'
            f"<w:r>{rpr}<w:delText{sp}>{_xml_escape_text(para_text)}</w:delText></w:r>"
            f"</w:del>"
            f"</w:p>"
        )
        replacements.append((p["p_start"], p["p_end"], del_para))
        revision_ids.append(id_counter)
        id_counter += 1

    if not replacements:
        return None, {"error": "No content to delete in the specified range."}

    # Splice back-to-front so the recorded offsets of earlier paragraphs
    # stay valid.
    new_xml = xml_str
    for p_start, p_end, del_para in reversed(replacements):
        new_xml = new_xml[:p_start] + del_para + new_xml[p_end:]

    info = {
        "mode": "range_delete",
        "matched_text": f"[{len(replacements)} paragraphs deleted]",
        "revision_ids": revision_ids,
        "paragraphs_deleted": len(replacements),
    }
    return _finalize_edit(new_xml, info, author)
858
def _text_contains(haystack: str, needle: str) -> bool:
    """Report whether *needle* occurs in *haystack*.

    Tries, in order: a verbatim match, a match after ``_normalize_quotes``
    on both strings, and a case-insensitive match on the normalised strings.
    """
    if needle in haystack:
        return True
    plain_haystack = _normalize_quotes(haystack)
    plain_needle = _normalize_quotes(needle)
    return (
        plain_needle in plain_haystack
        or plain_needle.lower() in plain_haystack.lower()
    )
870
871def _build_single_run_change(xml_str, candidate, new_text, author):
872 """Build tracked change for text found within a single ``<w:r>``.
873
874 If the matched run is inside an existing ``<w:ins>`` block, the text is
875 updated in-place (amending the insertion) rather than nesting tracked
876 changes, which matches how Word handles edits to unaccepted insertions.
877 """
878 run_start, run_end = candidate["run_span"]
879 run_xml = xml_str[run_start:run_end]
880 rpr = _extract_rpr_str(run_xml)
881
882 actual_old = candidate["actual_old"]
883 prefix = candidate["prefix"]
884 suffix = candidate["suffix"]
885
886 inside_ins = _is_inside_ins(run_start, xml_str)
887
888 if inside_ins:
889
890 ins_lines = [l for l in new_text.split('\n') if l.strip()] if new_text and '\n' in new_text else []
891 ins_multiline = len(ins_lines) > 1
892
893 if ins_multiline:
894
895 try:
896 p_start, p_end = _find_enclosing_paragraph(xml_str, run_start)
897 para_xml = xml_str[p_start:p_end]
898 ppr = _extract_ppr_str(para_xml)
899
900 ins_re = re.compile(r"<w:ins\b[^>]*>", re.DOTALL)
901 ins_tag = None
902 for im in ins_re.finditer(xml_str):
903 if im.start() <= run_start:
904 ins_end = xml_str.find("</w:ins>", im.start())
905 if ins_end != -1 and ins_end >= run_end:
906 ins_tag = im.group(0)
907 ins_open = ins_tag or '<w:ins w:id="0" w:author="Anylegal.ai">'
908
909 before_run = xml_str[p_start:run_start]
910 after_run = xml_str[run_end:p_end]
911
912 first_p = before_run
913 if prefix:
914 sp = _space_attr(prefix)
915 first_p += f"<w:r>{rpr}<w:t{sp}>{_xml_escape_text(prefix)}</w:t></w:r>"
916 first_p += _build_ins_runs(ins_lines[0], rpr)
917 if suffix:
918 sp = _space_attr(suffix)
919 first_p += f"<w:r>{rpr}<w:t{sp}>{_xml_escape_text(suffix)}</w:t></w:r>"
920 first_p += after_run
921
922 paras = [first_p]
923
924 base_rpr = _strip_rpr_bold(rpr)
925 base_ppr = _strip_ppr_numbering(ppr)
926 for line in ins_lines[1:]:
927 new_p = (
928 f"<w:p>{base_ppr}"
929 f"{ins_open}"
930 + _build_ins_runs(line, base_rpr)
931 + "</w:ins>"
932 "</w:p>"
933 )
934 paras.append(new_p)
935
936 replacement = "".join(paras)
937 new_xml = xml_str[:p_start] + replacement + xml_str[p_end:]
938
939 return new_xml, {
940 "matched_text": actual_old,
941 "replacement_text": new_text,
942 "mode": "ins_amend_multiline",
943 "paragraphs_added": len(ins_lines) - 1,
944 }
945 except ValueError:
946 pass
947
948 parts = []
949
950 if prefix:
951 sp = _space_attr(prefix)
952 parts.append(f"<w:r>{rpr}<w:t{sp}>{_xml_escape_text(prefix)}</w:t></w:r>")
953
954 if new_text:
955 parts.append(_build_ins_runs(new_text, rpr))
956
957 if suffix:
958 sp = _space_attr(suffix)
959 parts.append(f"<w:r>{rpr}<w:t{sp}>{_xml_escape_text(suffix)}</w:t></w:r>")
960
961 replacement = "".join(parts)
962 new_xml = xml_str[:run_start] + replacement + xml_str[run_end:]
963
964 return new_xml, {
965 "matched_text": actual_old,
966 "replacement_text": new_text,
967 "mode": "ins_amend",
968 }
969
970 next_id = _next_revision_id(xml_str)
971 date_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
972
973 lines = [l for l in new_text.split('\n') if l.strip()] if new_text and '\n' in new_text else []
974 multiline = len(lines) > 1
975
976 if multiline:
977
978 try:
979 p_start, p_end = _find_enclosing_paragraph(xml_str, run_start)
980 para_xml = xml_str[p_start:p_end]
981 ppr = _extract_ppr_str(para_xml)
982
983 before_run = xml_str[p_start:run_start]
984 after_run = xml_str[run_end:p_end]
985
986 paras = []
987
988 first_p = before_run
989 if prefix:
990 sp = _space_attr(prefix)
991 first_p += f"<w:r>{rpr}<w:t{sp}>{_xml_escape_text(prefix)}</w:t></w:r>"
992
993 sp_d = _space_attr(actual_old)
994 first_p += (
995 f'<w:del w:id="{next_id}" w:author="{author}" w:date="{date_str}">'
996 f"<w:r>{rpr}<w:delText{sp_d}>{_xml_escape_text(actual_old)}</w:delText></w:r>"
997 f"</w:del>"
998 )
999
1000 first_p += (
1001 f'<w:ins w:id="{next_id + 1}" w:author="{author}" w:date="{date_str}">'
1002 + _build_ins_runs(lines[0], rpr)
1003 + "</w:ins>"
1004 )
1005
1006 if suffix:
1007 sp = _space_attr(suffix)
1008 first_p += f"<w:r>{rpr}<w:t{sp}>{_xml_escape_text(suffix)}</w:t></w:r>"
1009
1010 first_p += after_run
1011 paras.append(first_p)
1012
1013 base_rpr = _strip_rpr_bold(rpr)
1014 base_ppr = _strip_ppr_numbering(ppr)
1015 for i, line in enumerate(lines[1:], start=2):
1016 ins_id = next_id + i
1017 new_p = (
1018 f"<w:p>{base_ppr}"
1019 f'<w:ins w:id="{ins_id}" w:author="{author}" w:date="{date_str}">'
1020 + _build_ins_runs(line, base_rpr)
1021 + "</w:ins>"
1022 "</w:p>"
1023 )
1024 paras.append(new_p)
1025
1026 replacement = "".join(paras)
1027 new_xml = xml_str[:p_start] + replacement + xml_str[p_end:]
1028
1029 return new_xml, {
1030 "matched_text": actual_old,
1031 "replacement_text": new_text,
1032 "paragraphs_added": len(lines) - 1,
1033 "revision_ids": list(range(next_id, next_id + len(lines) + 1)),
1034 }
1035 except ValueError:
1036 pass
1037
1038 parts = []
1039
1040 if prefix:
1041 sp = _space_attr(prefix)
1042 parts.append(f"<w:r>{rpr}<w:t{sp}>{_xml_escape_text(prefix)}</w:t></w:r>")
1043
1044 sp_d = _space_attr(actual_old)
1045 parts.append(
1046 f'<w:del w:id="{next_id}" w:author="{author}" w:date="{date_str}">'
1047 f"<w:r>{rpr}<w:delText{sp_d}>{_xml_escape_text(actual_old)}</w:delText></w:r>"
1048 f"</w:del>"
1049 )
1050
1051 if new_text:
1052 parts.append(
1053 f'<w:ins w:id="{next_id + 1}" w:author="{author}" w:date="{date_str}">'
1054 + _build_ins_runs(new_text, rpr)
1055 + "</w:ins>"
1056 )
1057
1058 if suffix:
1059 sp = _space_attr(suffix)
1060 parts.append(f"<w:r>{rpr}<w:t{sp}>{_xml_escape_text(suffix)}</w:t></w:r>")
1061
1062 replacement = "".join(parts)
1063 new_xml = xml_str[:run_start] + replacement + xml_str[run_end:]
1064
1065 return new_xml, {
1066 "matched_text": actual_old,
1067 "replacement_text": new_text,
1068 "revision_ids": [next_id] + ([next_id + 1] if new_text else []),
1069 }
1070
def _scan_cross_run(xml_str, t_re, tc_ranges, old_text):
    """
    Collect paragraphs in which *old_text* straddles two or more runs.

    Each ``<w:p>`` is flattened into a single string made of its ``<w:t>``
    payloads; a ``<w:br/>`` adds ``"\\n"`` to the text of the run recorded
    just before it.  Pieces whose position lies inside *tc_ranges* are
    ignored.  The flattened string is searched with
    :func:`_find_in_paragraph`, so at most one hit per paragraph is reported,
    and a hit only counts when it touches at least two run records.

    *t_re* is accepted but not used by this function.

    Returns a **list** of candidate dicts with the keys ``runs``,
    ``start_idx``, ``end_idx``, ``match_start``, ``match_end``,
    ``actual_old`` and ``p_start`` (empty list when nothing matched).
    """
    paragraph_re = re.compile(r"<w:p\b[^>]*>(.*?)</w:p>", re.DOTALL)
    piece_re = re.compile(
        r"<w:t([^>]*)>([^<]*)</w:t>|<w:br\b[^/]*/>",
        re.DOTALL,
    )

    found = []

    for para in paragraph_re.finditer(xml_str):
        body_offset = para.start(1)

        records = []
        flat = ""

        for piece in piece_re.finditer(para.group(1)):
            where = body_offset + piece.start()
            if _in_tracked_change(where, tc_ranges):
                continue

            raw_text = piece.group(2)
            if raw_text is None:
                # <w:br/>: charge the newline to the preceding run so the
                # offsets of consecutive records stay contiguous.  A break
                # before any text has been recorded is dropped.
                if records:
                    tail = records[-1]
                    tail["decoded"] += "\n"
                    tail["text_end"] += 1
                    flat += "\n"
                continue

            plain = _decode_xml_text(raw_text)
            if not plain:
                continue

            try:
                r_start, r_end = _find_enclosing_run(xml_str, where)
            except ValueError:
                # No enclosing <w:r> could be located for this text node.
                continue

            records.append({
                "decoded": plain,
                "text_start": len(flat),
                "text_end": len(flat) + len(plain),
                "run_start": r_start,
                "run_end": r_end,
                "rpr": _extract_rpr_str(xml_str[r_start:r_end]),
            })
            flat += plain

        # A cross-run match needs at least two runs to span.
        if len(records) < 2:
            continue

        hit = _find_in_paragraph(flat, old_text)
        if hit is None:
            continue

        hit_start, matched = hit
        hit_end = hit_start + len(matched)

        # First record that reaches past the hit start, last record that
        # begins before the hit end.
        first = last = None
        for position, record in enumerate(records):
            if first is None and record["text_end"] > hit_start:
                first = position
            if record["text_start"] < hit_end:
                last = position

        # Hits contained in a single run are not cross-run matches.
        if first is None or last is None or first == last:
            continue

        found.append({
            "runs": records,
            "start_idx": first,
            "end_idx": last,
            "match_start": hit_start,
            "match_end": hit_end,
            "actual_old": matched,
            "p_start": para.start(),
        })

    return found
1159
def _normalize_for_match(text: str) -> str:
    """Return *text* in the canonical form used for tolerant comparison.

    The normalisers run in a fixed order: symbols first, then quotes,
    then whitespace.
    """
    canonical = _normalize_symbols(text)
    canonical = _normalize_quotes(canonical)
    return _normalize_whitespace(canonical)
1163
def _find_in_paragraph(concat, old_text):
    """
    Try to find *old_text* in concatenated paragraph text.

    Strategies are tried in order; the first hit wins:

    1. exact substring;
    2. substring after :func:`_normalize_for_match` on both sides;
    3. the same comparison, lower-cased;
    4. :func:`_find_whitespace_agnostic`;
    5. :func:`_find_fillblank_agnostic`.

    Returns ``(start_pos, actual_old_text)`` or ``None``.  For strategies
    1-3 ``start_pos`` is an index into the raw *concat* and
    ``actual_old_text`` is the raw slice starting there, so callers can
    compute the end of the match as ``start_pos + len(actual_old_text)``.
    """
    if old_text in concat:
        return (concat.index(old_text), old_text)

    norm_old = _normalize_for_match(old_text)
    norm_concat = _normalize_for_match(concat)

    # Normalisation can lengthen text (U+2026 becomes "..."), so an offset in
    # norm_concat is not automatically an offset in concat, and the matched
    # span is not necessarily len(old_text) characters long.  origin[i] is
    # the raw index that produced normalised character i.
    if len(norm_concat) == len(concat):
        origin = range(len(concat))
    else:
        # Relies on the normalisers being per-character substitutions, so
        # normalising one character at a time reproduces norm_concat.
        pieces = [_normalize_for_match(ch) for ch in concat]
        norm_concat = "".join(pieces)
        origin = [i for i, piece in enumerate(pieces) for _ in piece]

    idx = norm_concat.find(norm_old)
    if idx < 0:
        idx = norm_concat.lower().find(norm_old.lower())

    if idx >= 0 and norm_old and origin:
        # Clamp the lookups: str.lower() can change the length of a few
        # code points, and this path must never raise.
        top = len(origin) - 1
        raw_start = origin[min(idx, top)]
        raw_end = origin[min(idx + len(norm_old) - 1, top)] + 1
        return (raw_start, concat[raw_start:raw_end])

    result = _find_whitespace_agnostic(concat, old_text)
    if result is not None:
        return result

    return _find_fillblank_agnostic(concat, old_text)
1192
def _find_whitespace_agnostic(concat, old_text):
    """Match ``old_text`` against ``concat`` ignoring whitespace.

    Both strings are normalised with :func:`_normalize_for_match` and have
    every whitespace character removed before comparison; a case-sensitive
    search is tried first, then a lower-cased one.

    Returns ``(start_pos_in_concat, actual_old_text_in_concat)`` or ``None``.
    The returned ``actual`` span is the raw (whitespace-preserving) slice of
    ``concat`` — so downstream tracked-change generation operates on exactly
    the text the document stores, not the LLM's reshaped version.
    """
    norm_old = _normalize_for_match(old_text)
    stripped_old = "".join(c for c in norm_old if not c.isspace())
    if not stripped_old:
        return None

    norm_concat = _normalize_for_match(concat)

    # Normalisation can lengthen text (U+2026 becomes "..."), so positions in
    # norm_concat must be translated back before slicing the raw concat.
    # origin[i] is the raw index that produced normalised character i.
    if len(norm_concat) == len(concat):
        origin = range(len(concat))
    else:
        # Relies on the normalisers being per-character substitutions, so
        # normalising one character at a time reproduces norm_concat.
        pieces = [_normalize_for_match(ch) for ch in concat]
        norm_concat = "".join(pieces)
        origin = [i for i, piece in enumerate(pieces) for _ in piece]

    stripped_chars = []
    index_map = []  # position in stripped_concat -> raw index in concat
    for i, ch in enumerate(norm_concat):
        if not ch.isspace():
            stripped_chars.append(ch)
            index_map.append(origin[i])
    stripped_concat = "".join(stripped_chars)

    pos = stripped_concat.find(stripped_old)
    if pos < 0:
        pos = stripped_concat.lower().find(stripped_old.lower())
    if pos < 0:
        return None

    # Clamp: str.lower() can change the length of a few code points.
    top = len(index_map) - 1
    concat_start = index_map[min(pos, top)]
    concat_end = index_map[min(pos + len(stripped_old) - 1, top)] + 1
    return (concat_start, concat[concat_start:concat_end])
1229
# Two or more fill-blank characters (dots, ellipsis variants, middle dot,
# underscore), optionally separated by whitespace.
_FILLBLANK_RUN_RE = re.compile(
    r"(?:[.…․‥·_]\s*){1,}[.…․‥·_]"
    r"|[.…․‥·_]{2,}"
)


def _find_fillblank_agnostic(concat, old_text):
    """Match ``old_text`` against ``concat`` ignoring fill-blank runs.

    Returns ``(start_pos_in_concat, actual_old_text_in_concat)`` or ``None``.

    A "fill-blank run" is 2+ consecutive characters drawn from the set
    ``[. … ․ ‥ · _]`` (possibly with whitespace inside the run). Both the
    document and the LLM-supplied old_text get those runs deleted before
    matching; positions in the stripped space are mapped back so the
    returned actual span covers exactly what the document stores
    (whitespace, fill-blanks and all).

    Single dots (sentence punctuation) are NOT stripped — only runs of 2+.
    Runs are detected after :func:`_normalize_for_match`, which turns a
    single U+2026 into ``...``, so a lone ellipsis counts as a run.

    Whitespace is also stripped, mirroring _find_whitespace_agnostic, so
    callers reaching this fallback don't need to call both.
    """
    norm_old = _normalize_for_match(old_text)
    norm_concat = _normalize_for_match(concat)

    # Normalisation can lengthen text (U+2026 becomes "..."), so positions in
    # norm_concat must be translated back before slicing the raw concat.
    # origin[i] is the raw index that produced normalised character i.
    if len(norm_concat) == len(concat):
        origin = range(len(concat))
    else:
        # Relies on the normalisers being per-character substitutions, so
        # normalising one character at a time reproduces norm_concat.
        pieces = [_normalize_for_match(ch) for ch in concat]
        norm_concat = "".join(pieces)
        origin = [i for i, piece in enumerate(pieces) for _ in piece]

    def _mark_fillblanks(s: str):
        """Return a per-character flag list: True inside a fill-blank run."""
        mark = [False] * len(s)
        for m in _FILLBLANK_RUN_RE.finditer(s):
            for i in range(m.start(), m.end()):
                mark[i] = True
        return mark

    concat_mark = _mark_fillblanks(norm_concat)
    old_mark = _mark_fillblanks(norm_old)

    stripped_old = "".join(
        ch for i, ch in enumerate(norm_old) if not old_mark[i] and not ch.isspace()
    )
    if not stripped_old:
        return None

    stripped_concat_chars = []
    index_map = []  # position in stripped_concat -> raw index in concat
    for i, ch in enumerate(norm_concat):
        if concat_mark[i] or ch.isspace():
            continue
        stripped_concat_chars.append(ch)
        index_map.append(origin[i])
    stripped_concat = "".join(stripped_concat_chars)

    pos = stripped_concat.find(stripped_old)
    if pos < 0:
        pos = stripped_concat.lower().find(stripped_old.lower())
    if pos < 0:
        return None

    # Clamp: str.lower() can change the length of a few code points.
    top = len(index_map) - 1
    concat_start = index_map[min(pos, top)]
    concat_end = index_map[min(pos + len(stripped_old) - 1, top)] + 1
    return (concat_start, concat[concat_start:concat_end])
1292
def _dominant_rpr(runs, start_idx, end_idx, match_start, match_end):
    """Pick the rPr of the run that overlaps the match by the most characters.

    A match may cover runs with different formatting, for instance a short
    bold label followed by a long plain body.  Taking the formatting of the
    run that supplies most of the matched text keeps the replacement from
    inheriting the minority style.

    Runs ``start_idx`` through ``end_idx`` (inclusive) are compared.  On a
    tie the earliest run wins; if no run overlaps the match at all, the rPr
    of ``runs[start_idx]`` is returned.
    """
    winner = runs[start_idx]["rpr"]
    widest = 0
    for position in range(start_idx, end_idx + 1):
        candidate = runs[position]
        overlap = (
            min(match_end, candidate["text_end"])
            - max(match_start, candidate["text_start"])
        )
        # Strictly greater, so the first of several equal runs is kept.
        if overlap > widest:
            widest, winner = overlap, candidate["rpr"]
    return winner
1314
def _build_cross_run_change(xml_str, cross, new_text, author):
    """
    Produce the edited XML for a match that covers several ``<w:r>`` elements.

    *cross* is a candidate dict as produced by :func:`_scan_cross_run`.
    Everything from the start of the first affected run to the end of the
    last one is replaced by: an untouched run for text before the match, the
    change itself, and an untouched run for text after the match.

    Two modes:

    * The first affected run sits inside an existing ``<w:ins>``: the text is
      rewritten in place with the first run's formatting and no new revision
      marks are created (``mode == "ins_amend"``).
    * Otherwise a ``<w:del>`` is emitted containing one ``<w:r>`` per affected
      run (each with its own rPr), followed — when *new_text* is non-empty —
      by a ``<w:ins>`` formatted with the dominant run's rPr.

    Returns ``(new_xml, info)``; ``info`` carries ``revision_ids`` only in
    the second mode.

    NOTE(review): *author* is interpolated into the ``w:author`` attribute
    as-is; presumably callers pass an attribute-safe value — confirm.
    """
    runs = cross["runs"]
    first_i, last_i = cross["start_idx"], cross["end_idx"]
    lo, hi = cross["match_start"], cross["match_end"]
    matched = cross["actual_old"]

    head_run = runs[first_i]
    tail_run = runs[last_i]

    def untouched(run_rpr, text):
        """Return a plain run carrying *text*, or "" when there is no text."""
        if not text:
            return ""
        return (
            f"<w:r>{run_rpr}<w:t{_space_attr(text)}>"
            f"{_xml_escape_text(text)}</w:t></w:r>"
        )

    # Text of the boundary runs that lies outside the match is kept verbatim.
    before = head_run["decoded"][: lo - head_run["text_start"]]
    after = tail_run["decoded"][hi - tail_run["text_start"]:]

    cut_from = head_run["run_start"]
    cut_to = tail_run["run_end"]
    span = last_i - first_i + 1

    if _is_inside_ins(cut_from, xml_str):
        # Amend the pending insertion instead of nesting tracked changes.
        pieces = [untouched(head_run["rpr"], before)]
        if new_text:
            pieces.append(_build_ins_runs(new_text, head_run["rpr"]))
        pieces.append(untouched(tail_run["rpr"], after))
        amended = xml_str[:cut_from] + "".join(pieces) + xml_str[cut_to:]
        return amended, {
            "matched_text": matched,
            "replacement_text": new_text,
            "cross_run": True,
            "runs_affected": span,
            "mode": "ins_amend",
        }

    rev_id = _next_revision_id(xml_str)
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    pieces = [untouched(head_run["rpr"], before)]

    # One deleted run per affected source run, so each keeps its formatting.
    struck = []
    for position in range(first_i, last_i + 1):
        run = runs[position]
        base = run["text_start"]
        gone = run["decoded"][max(lo, base) - base:min(hi, run["text_end"]) - base]
        if gone:
            struck.append(
                f"<w:r>{run['rpr']}<w:delText{_space_attr(gone)}>"
                f"{_xml_escape_text(gone)}</w:delText></w:r>"
            )
    pieces.append(
        f'<w:del w:id="{rev_id}" w:author="{author}" w:date="{stamp}">'
        + "".join(struck)
        + "</w:del>"
    )

    revision_ids = [rev_id]
    if new_text:
        ins_rpr = _dominant_rpr(runs, first_i, last_i, lo, hi)
        pieces.append(
            f'<w:ins w:id="{rev_id + 1}" w:author="{author}" w:date="{stamp}">'
            + _build_ins_runs(new_text, ins_rpr)
            + "</w:ins>"
        )
        revision_ids.append(rev_id + 1)

    pieces.append(untouched(tail_run["rpr"], after))

    edited = xml_str[:cut_from] + "".join(pieces) + xml_str[cut_to:]
    return edited, {
        "matched_text": matched,
        "replacement_text": new_text,
        "cross_run": True,
        "runs_affected": span,
        "revision_ids": revision_ids,
    }
1430
def _scan_cross_paragraph(xml_str, t_re, tc_ranges, old_text):
    """
    Search for *old_text* spanning multiple ``<w:p>`` elements.

    Collects paragraph text, concatenates with separators, and attempts
    to locate *old_text* in the concatenated string. Tries ``\\n\\n``,
    ``\\n``, and space as separators to account for how the LLM may
    collapse paragraph boundaries.  Comparison is done on
    :func:`_normalize_for_match` output, case-sensitively first and then
    lower-cased.

    A hit is accepted only when it covers at least two paragraphs and all
    non-blank paragraphs in the covered range share one
    :func:`_structural_context`.  A hit whose non-blank paragraphs are all
    insertion-only is remembered but the search continues, so a later hit
    on original text is preferred.  A hit confined to one paragraph ends the
    search for the current separator.

    *t_re* is accepted but not used by this function.

    Returns a candidate dict (keys ``paras``, ``start_para``, ``end_para``,
    ``actual_old``) or ``None``.  ``actual_old`` is *old_text* as supplied.
    """
    p_re = re.compile(r"<w:p\b[^>]*>.*?</w:p>", re.DOTALL)
    # Compiled once instead of once per paragraph.
    t_br_re = re.compile(
        r"<w:t([^>]*)>([^<]*)</w:t>|<w:br\b[^/]*/>",
        re.DOTALL,
    )

    paras = []
    for p_match in p_re.finditer(xml_str):
        p_xml = p_match.group(0)
        p_start = p_match.start()

        texts = []
        first_rpr = None
        for m in t_br_re.finditer(p_xml):
            abs_pos = p_start + m.start()
            if _in_tracked_change(abs_pos, tc_ranges):
                continue
            if m.group(2) is None:
                # <w:br/> is represented as a newline in the paragraph text.
                texts.append("\n")
                continue
            decoded = _decode_xml_text(m.group(2))
            if not decoded:
                continue
            texts.append(decoded)
            if first_rpr is None:
                # Remember the formatting of the first run that has text.
                try:
                    rs, re_ = _find_enclosing_run(xml_str, abs_pos)
                    first_rpr = _extract_rpr_str(xml_str[rs:re_])
                except ValueError:
                    pass

        paras.append({
            "text": "".join(texts),
            "p_start": p_start,
            "p_end": p_match.end(),
            "ppr": _extract_ppr_str(p_xml),
            "rpr": first_rpr or "",
            "ctx": _structural_context(xml_str, p_start),
        })

    if len(paras) < 2:
        return None

    norm_old = _normalize_for_match(old_text)
    lower_old = norm_old.lower()
    # Normalised length of every paragraph, computed once rather than for
    # every hit and every separator.
    norm_lens = [len(_normalize_for_match(p["text"])) for p in paras]

    for sep in ("\n\n", "\n", " "):
        norm_concat = _normalize_for_match(sep.join(p["text"] for p in paras))
        lower_concat = None  # built lazily, at most once per separator
        sep_len = len(sep)

        search_start = 0
        best_candidate = None

        while True:
            idx = norm_concat.find(norm_old, search_start)
            if idx == -1:
                if lower_concat is None:
                    lower_concat = norm_concat.lower()
                idx = lower_concat.find(lower_old, search_start)
            if idx < 0:
                break

            # Translate the character span into a paragraph index range.
            char_pos = 0
            start_para = end_para = None
            match_end = idx + len(norm_old)

            for i, p_len in enumerate(norm_lens):
                p_end_pos = char_pos + p_len

                if start_para is None and idx < p_end_pos:
                    start_para = i
                if match_end <= p_end_pos:
                    end_para = i
                    break

                char_pos = p_end_pos + sep_len

            if start_para is None or end_para is None or start_para == end_para:
                # Not a multi-paragraph hit: stop trying this separator.
                break

            covered = [
                paras[i] for i in range(start_para, end_para + 1)
                if paras[i]["text"].strip()
            ]

            # Refuse spans that would cut across different structural
            # contexts; look for a later occurrence instead.
            if len({p["ctx"] for p in covered}) > 1:
                search_start = idx + 1
                continue

            candidate = {
                "paras": paras,
                "start_para": start_para,
                "end_para": end_para,
                "actual_old": old_text,
            }

            all_ins = all(
                _is_ins_only_paragraph(xml_str, p["p_start"], p["p_end"])
                for p in covered
            )
            if not all_ins:
                return candidate

            # Insertion-only hit: keep the first one as a fallback and keep
            # looking for a hit on original text.
            if best_candidate is None:
                best_candidate = candidate
            search_start = idx + 1

        if best_candidate is not None:
            return best_candidate

    return None
1551
def _is_ins_only_paragraph(xml_str: str, p_start: int, p_end: int) -> bool:
    """Tell whether every visible ``<w:t>`` of a paragraph lies in ``<w:ins>``.

    The paragraph is ``xml_str[p_start:p_end]``.  ``<w:t>`` elements located
    inside a ``<w:del>`` block are not considered visible and are ignored.
    The result is True only when the paragraph has at least one ``<w:ins>``
    block, at least one visible ``<w:t>``, and every visible ``<w:t>`` starts
    inside some ``<w:ins>`` block.

    Callers use this to drop a previously inserted paragraph outright
    instead of marking text as deleted that the original never contained.
    """
    paragraph = xml_str[p_start:p_end]

    inserted = [
        hit.span()
        for hit in re.finditer(r"<w:ins\b[^>]*>.*?</w:ins>", paragraph, re.DOTALL)
    ]
    if not inserted:
        return False

    deleted = [
        hit.span()
        for hit in re.finditer(r"<w:del\b[^>]*>.*?</w:del>", paragraph, re.DOTALL)
    ]

    def covered(offset, spans):
        """True when *offset* falls inside any half-open span."""
        return any(lo <= offset < hi for lo, hi in spans)

    visible = [
        hit.start()
        for hit in re.finditer(r"<w:t\b[^>]*>[^<]*</w:t>", paragraph, re.DOTALL)
        if not covered(hit.start(), deleted)
    ]
    return bool(visible) and all(covered(offset, inserted) for offset in visible)
1583
def _build_cross_paragraph_change(xml_str, cross, new_text, author):
    """
    Produce the edited XML for a match that covers several ``<w:p>`` elements.

    *cross* is a candidate dict as produced by :func:`_scan_cross_paragraph`.
    The XML from the start of the first matched paragraph to the end of the
    last one is replaced wholesale.  Blank paragraphs in that range are not
    re-emitted.  *new_text* is split on newlines and blank lines are
    discarded.

    Deletion and insertion are placed in the **same** ``<w:p>`` whenever an
    old paragraph can be paired with a new line, so that a numbered heading
    does not consume an extra number for the deleted text.

    Paragraphs whose visible text is entirely inside ``<w:ins>`` (see
    :func:`_is_ins_only_paragraph`) are dropped rather than wrapped in
    ``<w:del>``: removing a pending insertion must not show up as a deletion
    of original text.  Each inspected paragraph dict gets an ``ins_only``
    key written to it as a side effect.

    Layout rules:
      - Paired (1:1):                   ``<w:p> pPr + DEL + INS </w:p>``
      - Extra DEL (more old than new):  ``<w:p> pPr + DEL </w:p>``
      - Extra INS (more new than old):  ``<w:p> pPr + INS </w:p>`` with
        numbering and bold stripped from the borrowed properties
      - Ins-only paragraph:             dropped entirely

    Returns ``(new_xml, info)``.

    NOTE(review): *author* is interpolated into the ``w:author`` attribute
    as-is; presumably callers pass an attribute-safe value — confirm.
    """
    paras = cross["paras"]
    first_i, last_i = cross["start_para"], cross["end_para"]
    matched_text = cross["actual_old"]

    cut_from = paras[first_i]["p_start"]
    cut_to = paras[last_i]["p_end"]

    # Non-blank paragraphs in the matched range, each tagged as ins-only or not.
    touched = []
    for position in range(first_i, last_i + 1):
        para = paras[position]
        if para["text"].strip():
            para["ins_only"] = _is_ins_only_paragraph(
                xml_str, para["p_start"], para["p_end"]
            )
            touched.append(para)

    originals = [para for para in touched if not para.get("ins_only")]
    dropped = [para for para in touched if para.get("ins_only")]

    # Formatting used when a paragraph has no run properties of its own.
    if originals:
        default_rpr = originals[0]["rpr"]
    elif touched:
        default_rpr = touched[0]["rpr"]
    else:
        default_rpr = ""

    first_id = _next_revision_id(xml_str)
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    def deletion(para, rev_id):
        """Return a ``<w:del>`` block holding the paragraph's whole text."""
        rpr = para["rpr"] or default_rpr
        body = para["text"]
        return (
            f'<w:del w:id="{rev_id}" w:author="{author}" w:date="{stamp}">'
            f"<w:r>{rpr}<w:delText{_space_attr(body)}>{_xml_escape_text(body)}</w:delText></w:r>"
            f"</w:del>"
        )

    def insertion(line, rpr, rev_id):
        """Return a ``<w:ins>`` block holding *line*."""
        return (
            f'<w:ins w:id="{rev_id}" w:author="{author}" w:date="{stamp}">'
            + _build_ins_runs(line, rpr)
            + "</w:ins>"
        )

    fresh = [ln for ln in new_text.split('\n') if ln.strip()] if new_text else []
    n_old = len(originals)
    n_new = len(fresh)
    n_both = min(n_old, n_new)

    out = []
    rev = first_id

    # Paired: old paragraph and new line share one <w:p>.
    for position in range(n_both):
        para = originals[position]
        out.append(
            f"<w:p>{para['ppr']}"
            + deletion(para, rev)
            + insertion(fresh[position], para["rpr"] or default_rpr, rev + 1)
            + "</w:p>"
        )
        rev += 2

    # Surplus old paragraphs: deletion only.
    for position in range(n_both, n_old):
        para = originals[position]
        out.append(f"<w:p>{para['ppr']}" + deletion(para, rev) + "</w:p>")
        rev += 1

    # Surplus new lines: insertion only, borrowing paragraph properties.
    for position in range(n_both, n_new):
        if originals:
            template = originals[min(position, n_old - 1)]
        elif touched:
            template = touched[min(position, len(touched) - 1)]
        else:
            template = {"ppr": "", "rpr": ""}
        plain_ppr = _strip_ppr_numbering(template["ppr"])
        plain_rpr = _strip_rpr_bold(template["rpr"] or default_rpr)
        out.append(
            f"<w:p>{plain_ppr}"
            + insertion(fresh[position], plain_rpr, rev)
            + "</w:p>"
        )
        rev += 1

    edited = xml_str[:cut_from] + "".join(out) + xml_str[cut_to:]

    return edited, {
        "matched_text": matched_text,
        "replacement_text": new_text,
        "cross_paragraph": True,
        "paragraphs_matched": len(touched),
        "paragraphs_replaced": n_new,
        "paragraphs_dropped": len(dropped),
        "revision_ids": list(range(first_id, rev)),
    }
1700
def _structural_context(xml_str: str, pos: int) -> tuple:
    """Return a key describing the table nesting in effect at *pos*.

    The key is built from ``xml_str[:pos]`` only.  For each of ``tbl``,
    ``tr`` and ``tc`` it holds the number of occurrences of the opening
    prefix (``<w:tbl``, ``<w:tr``, ``<w:tc``) minus the number of closing
    tags.  When the ``tc`` figure is positive, the total number of
    ``</w:tc>`` seen so far is appended so that neighbouring cells with
    identical counts still get different keys.

    Callers compare keys for equality to reject cross-paragraph edits that
    would cut across ``<w:tc>``/``<w:tr>``/``<w:tbl>`` wrappers.

    NOTE(review): the opening counts are plain prefix matches, so elements
    such as ``<w:tblPr>`` or ``<w:tcPr>`` are counted as well.  The figures
    are therefore suitable for equality comparison but are not exact
    nesting depths.
    """
    head = xml_str[:pos]

    def net(tag):
        """Opening-prefix count minus closing-tag count for ``w:<tag>``."""
        return head.count(f"<w:{tag}") - head.count(f"</w:{tag}>")

    key = (net("tbl"), net("tr"), net("tc"))
    if key[2] <= 0:
        return key
    # Same counts can occur in adjacent cells; the running close count
    # tells them apart.
    return key + (head.count("</w:tc>"),)
1721
def _decode_xml_text(raw: str) -> str:
    """Turn the escaped content of a ``<w:t>`` element into plain Unicode.

    Three passes, in this order: every entry of ``_XML_ENTITY_DECODE`` is
    substituted, then hexadecimal character references (``&#x..;``), then
    decimal ones (``&#..;``).

    NOTE(review): the table runs before the numeric passes, so if it maps
    ``&amp;`` to ``&`` an input such as ``&amp;#65;`` would be decoded twice
    — confirm against the contents of ``_XML_ENTITY_DECODE``.
    """
    def _from_hex(found):
        return chr(int(found.group(1), 16))

    def _from_decimal(found):
        return chr(int(found.group(1)))

    decoded = raw
    for token, replacement in _XML_ENTITY_DECODE.items():
        decoded = decoded.replace(token, replacement)

    decoded = re.sub(r"&#x([0-9a-fA-F]+);", _from_hex, decoded)
    return re.sub(r"&#(\d+);", _from_decimal, decoded)
1731
def _normalize_quotes(text: str) -> str:
    """Replace curly double and single quotes with their ASCII forms.

    U+201C and U+201D become ``"``; U+2018 and U+2019 become ``'``.
    Used for comparison only.
    """
    straight = {
        0x201C: '"',
        0x201D: '"',
        0x2018: "'",
        0x2019: "'",
    }
    return text.translate(straight)
1739
def _normalize_symbols(text: str) -> str:
    """Map look-alike bullet, dash and ellipsis characters to one form each.

    LLMs frequently swap visually similar Unicode symbols when quoting
    document text; collapsing them avoids spurious mismatches.

    Mappings: U+25CF to U+2022, U+25CB to U+25E6, U+25A0 to U+25AA,
    U+2014 and U+2012 to U+2013, and U+2026 to three ASCII dots.  The last
    one changes the length of the string.
    """
    canonical = {
        # bullets
        0x25CF: "\u2022",
        0x25CB: "\u25e6",
        0x25A0: "\u25aa",
        # dashes
        0x2014: "\u2013",
        0x2012: "\u2013",
        # ellipsis
        0x2026: "...",
    }
    return text.translate(canonical)
1759
def _normalize_whitespace(text: str) -> str:
    """Replace selected Unicode space characters with an ASCII space.

    Covered: U+00A0, U+2002, U+2003, U+2009, U+200A and U+202F.  Other
    whitespace (tabs, newlines, ...) is left untouched.
    """
    exotic_spaces = dict.fromkeys(
        (0x00A0, 0x2002, 0x2003, 0x2009, 0x200A, 0x202F), " "
    )
    return text.translate(exotic_spaces)
1764
def _xml_escape_text(text: str) -> str:
    """Escape *text* for use as XML element content.

    ``&``, ``<`` and ``>`` are replaced by their predefined entities, then
    each character listed in ``SMART_QUOTE_REPLACEMENTS`` is replaced by
    its character reference.
    """
    markup = {ord("&"): "&amp;", ord("<"): "&lt;", ord(">"): "&gt;"}
    escaped = text.translate(markup)
    # Runs after the markup pass so the "&" of the references stays intact.
    for glyph, reference in SMART_QUOTE_REPLACEMENTS.items():
        escaped = escaped.replace(glyph, reference)
    return escaped
1771
def _space_attr(text: str) -> str:
    """Return the ``xml:space`` attribute needed to keep edge whitespace.

    Yields ``' xml:space="preserve"'`` (with its leading space) when *text*
    begins or ends with a space or a tab, and ``""`` otherwise, including
    for empty input.
    """
    if not text:
        return ""
    edge_blanks = (" ", "\t")
    if text.startswith(edge_blanks) or text.endswith(edge_blanks):
        return ' xml:space="preserve"'
    return ""
1777
# ``**...**`` with at least one character in between; may span lines.
_MD_BOLD_RE = re.compile(r"\*\*(.+?)\*\*", re.DOTALL)


def _parse_md_bold(text: str) -> list:
    """Split *text* into plain and ``**bold**`` segments.

    Returns ``(segment_text, is_bold)`` tuples in document order, with the
    ``**`` markers removed from bold segments and empty segments omitted.
    Only paired double asterisks count; a single ``*`` or an unpaired ``**``
    stays literal.

    Text without any ``**`` is returned as one non-bold segment, even when
    it is empty.
    """
    if "**" not in text:
        return [(text, False)]

    # With one capture group, split() alternates plain text (even positions)
    # and captured bold text (odd positions).
    chunks = _MD_BOLD_RE.split(text)
    return [
        (chunk, position % 2 == 1)
        for position, chunk in enumerate(chunks)
        if chunk
    ]
1805
def _modify_rpr_bold(rpr_str: str, bold: bool) -> str:
    """Return *rpr_str* with ``<w:b/>`` added when *bold* is True.

    Handles:
    - Non-bold segment → returns *rpr_str* unchanged
    - Already bold (``<w:b/>`` or ``w:val`` of ``1``/``true``/``on``)
      → no change
    - Explicitly off (``w:val`` of ``0``/``false``/``off``) → the toggle is
      removed and ``<w:b/>`` is added
    - Empty string → ``<w:rPr><w:b/></w:rPr>``
    - Self-closing ``<w:rPr/>`` → ``<w:rPr><w:b/></w:rPr>``
    - Any other ``<w:rPr ...>`` → ``<w:b/>`` inserted right after the
      opening tag

    All spellings of the on/off value are recognised, not only ``0``/``1``,
    so that ``<w:b w:val="false"/>`` does not end up next to a second,
    conflicting ``<w:b/>``.
    """
    if not bold:
        return rpr_str

    # Bold is already switched on: nothing to add.
    if re.search(r'<w:b(?:\s+w:val="(?:1|true|on)")?\s*/>', rpr_str):
        return rpr_str

    # Drop an explicit "bold off" so it cannot contradict the new <w:b/>.
    result = re.sub(r'<w:b\s+w:val="(?:0|false|off)"\s*/>', "", rpr_str)

    if not result:
        return "<w:rPr><w:b/></w:rPr>"
    if re.match(r"<w:rPr\s*/>", result):
        return "<w:rPr><w:b/></w:rPr>"
    if "<w:rPr" in result:
        return re.sub(r"(<w:rPr\b[^>]*>)", r"\1<w:b/>", result, count=1)
    return "<w:rPr><w:b/></w:rPr>"
1831
def _strip_rpr_bold(rpr_str: str) -> str:
    """Remove the "bold on" toggle from an rPr XML string.

    Strips ``<w:b/>`` as well as ``<w:b w:val="..."/>`` with a value of
    ``1``, ``true`` or ``on``.  An explicit "bold off" toggle and other
    elements such as ``<w:bCs/>`` are left in place.  Empty input is
    returned unchanged.

    Used for multi-paragraph insertions where additional lines should not
    inherit bold formatting from the original matched heading/run.
    """
    if not rpr_str:
        return rpr_str
    return re.sub(r'<w:b(?:\s+w:val="(?:1|true|on)")?\s*/>', '', rpr_str)
1843
def _strip_ppr_numbering(ppr_str: str) -> str:
    """Return *ppr_str* without any ``<w:numPr>...</w:numPr>`` element.

    Additional paragraphs created for a multi-line insertion reuse the
    paragraph properties of the matched paragraph; removing the numbering
    keeps them from inheriting its list/heading number.  Empty input is
    returned unchanged.
    """
    if ppr_str:
        numbering = re.compile(r'<w:numPr>.*?</w:numPr>', re.DOTALL)
        return numbering.sub('', ppr_str)
    return ppr_str
1853
def _build_ins_runs(text: str, base_rpr: str) -> str:
    """Render *text* as one or more ``<w:r>`` elements.

    ``**bold**`` markers are interpreted via :func:`_parse_md_bold`: every
    segment becomes its own run, and bold segments get their run properties
    from :func:`_modify_rpr_bold` applied to *base_rpr*.  Text without
    markers yields exactly one run using *base_rpr* as given.
    """
    def render(rpr, chunk):
        """Return a single run holding *chunk* with properties *rpr*."""
        return (
            f"<w:r>{rpr}<w:t{_space_attr(chunk)}>"
            f"{_xml_escape_text(chunk)}</w:t></w:r>"
        )

    pieces = _parse_md_bold(text)

    # Fast path: a single plain segment is the whole input.
    only_plain = len(pieces) == 1 and not pieces[0][1]
    if only_plain:
        return render(base_rpr, text)

    return "".join(
        render(_modify_rpr_bold(base_rpr, is_bold), chunk)
        for chunk, is_bold in pieces
    )
1877
def _tracked_change_ranges(xml_str: str) -> list:
    """Return the ``(start, end)`` span of every ``<w:del>`` block.

    Only deletions are reported.  Text inside ``<w:ins>`` is current
    document content and has to stay matchable so later edits can change
    previously inserted text; text inside ``<w:del>`` is already gone and
    is what matching should skip.
    """
    deleted_block = re.compile(r"<w:del\b[^>]*>.*?</w:del>", re.DOTALL)
    return [hit.span() for hit in deleted_block.finditer(xml_str)]
1889
def _in_tracked_change(pos: int, ranges: list) -> bool:
    """Return True when *pos* lies in any half-open ``(start, end)`` range."""
    for lo, hi in ranges:
        if lo <= pos < hi:
            return True
    return False
1892
def _is_inside_del_dom(node) -> bool:
    """Report whether a DOM *node* has a ``<w:del>`` element among its ancestors.

    Walks up through ``parentNode``.  An ancestor counts when its
    ``localName`` (or, if that is missing or empty, its ``tagName``) equals
    ``del`` or ends with ``:del``.  The node itself is not inspected.
    """
    ancestor = node.parentNode
    while ancestor:
        label = getattr(ancestor, "localName", None)
        if not label:
            # Nodes without a local name fall back to the qualified tag name.
            label = getattr(ancestor, "tagName", "")
        if label == "del" or label.endswith(":del"):
            return True
        ancestor = ancestor.parentNode
    return False
1902
def _is_inside_ins(pos: int, xml_str: str) -> bool:
    """Report whether offset *pos* lies within some ``<w:ins>`` block.

    The whole of *xml_str* is scanned for ``<w:ins>...</w:ins>`` blocks and
    *pos* is tested against each block's half-open span.
    """
    inserted_block = re.compile(r"<w:ins\b[^>]*>.*?</w:ins>", re.DOTALL)
    return any(
        hit.start() <= pos < hit.end()
        for hit in inserted_block.finditer(xml_str)
    )
1909
def _toc_paragraph_ranges(xml_str: str) -> list:
    """Return ``[(start, end), ...]`` spans covering Table of Contents markup.

    Two OOXML shapes are recognised:

    A. A structured document tag (``<w:sdt>``) whose markup contains both
       ``docPartGallery`` and ``Table of Contents``.  The whole SDT block
       becomes one span.

    B. A paragraph whose ``<w:pStyle>`` value is ``TOC`` followed by digits
       (``TOC1``, ``TOC2``, ...), unless it is already enclosed by a span
       collected earlier.

    SDT spans come first in the result, followed by the paragraph spans in
    document order.  Skipping these regions avoids matching heading text in
    the TOC when the same text also appears in the body.
    """
    zones = []
    for block in re.finditer(r"<w:sdt\b[^>]*>.*?</w:sdt>", xml_str, re.DOTALL):
        markup = block.group(0)
        if "docPartGallery" in markup and "Table of Contents" in markup:
            zones.append(block.span())

    toc_style = re.compile(r'<w:pStyle[^>]*w:val="TOC\d+"')
    for para in re.finditer(r"<w:p\b[^>]*>.*?</w:p>", xml_str, re.DOTALL):
        if toc_style.search(para.group(0)) is None:
            continue
        lo, hi = para.span()
        # Already covered, typically by an enclosing TOC SDT block.
        if any(start <= lo and hi <= end for start, end in zones):
            continue
        zones.append((lo, hi))

    return zones
1947
def _skip_ranges(xml_str: str) -> list:
    """Return the regions that text matching must ignore.

    Takes the ``<w:del>`` spans from :func:`_tracked_change_ranges` and the
    Table of Contents spans from :func:`_toc_paragraph_ranges`, sorts them,
    and merges spans that overlap or touch.  The result is a sorted list of
    ``(start, end)`` tuples; it is empty when neither source reports
    anything.
    """
    zones = sorted(
        _tracked_change_ranges(xml_str) + _toc_paragraph_ranges(xml_str)
    )

    merged = []
    for lo, hi in zones:
        if merged and lo <= merged[-1][1]:
            # Overlaps or touches the previous span: extend it.
            prev_lo, prev_hi = merged[-1]
            merged[-1] = (prev_lo, max(prev_hi, hi))
        else:
            merged.append((lo, hi))
    return merged
1970
def _find_text_position(xml_str: str, text: str) -> int | None:
    """Locate the first ``<w:t>`` element containing *text* (case-insensitive).

    Returns the string offset at which that ``<w:t`` tag starts, or ``None``
    when no element's decoded text contains *text*.  Used as an anchor for
    proximity comparisons.
    """
    needle = text.lower()
    text_elements = re.finditer(r"<w:t[^>]*>([^<]*)</w:t>", xml_str, re.DOTALL)
    return next(
        (
            element.start()
            for element in text_elements
            if needle in _decode_xml_text(element.group(1)).lower()
        ),
        None,
    )
1979
1980def _resolve_candidates(xml_str: str, candidates: list, near_text: str, pos_key: str):
1981 """
1982 Pick one candidate from *candidates*.
1983
1984 - 1 candidate → return it directly.
1985 - >1 + *near_text* → pick the one closest to *near_text* in the XML.
1986 - >1 + no hint → return ``None`` (ambiguous).
1987 """
1988 if len(candidates) <= 1:
1989 return candidates[0] if candidates else None
1990 if not near_text:
1991 return None
1992 near_pos = _find_text_position(xml_str, near_text)
1993 if near_pos is None:
1994 return None
1995
1996 def _pos(c):
1997 p = c.get(pos_key)
1998 return p[0] if isinstance(p, tuple) else p
1999
2000 return min(candidates, key=lambda c: abs(_pos(c) - near_pos))
2001
2002def _ambiguity_error(old_text: str, count: int) -> dict:
2003 """Build error dict when *old_text* matches multiple locations."""
2004 return {
2005 "error": (
2006 f"Text '{old_text[:80]}' matches {count} locations in the document. "
2007 "Include more surrounding text in old_text to make it unique, "
2008 "or add near_text with text from a nearby heading or the same row."
2009 ),
2010 "suggestion": (
2011 f"Found {count} identical matches (likely same value in different table cells). "
2012 "Use near_text parameter with text from the same row to disambiguate. "
2013 "Example: near_text='Dubai governing law' to target that specific row."
2014 ),
2015 "match_count": count,
2016 }
2017
def _scan_for_match(xml_str, t_re, tc_ranges, old_text, mode):
    """
    Collect every ``<w:t>`` element whose text contains *old_text*.

    *mode* picks the comparison:

    - ``"exact"``  - plain substring test on the decoded text.
    - ``"quotes"`` - both sides are passed through ``_normalize_for_match``.
    - ``"icase"``  - both sides are lower-cased, then normalised.

    Any other mode produces no candidates.  *t_re* must expose the raw
    element text as capture group 2.  Elements that start inside one of
    *tc_ranges*, or for which no enclosing run can be located, are skipped.

    Returns a list of dicts with the keys ``actual_old`` (the matched slice
    of the decoded text), ``prefix`` / ``suffix`` (decoded text before and
    after the match) and ``run_span`` (offsets of the enclosing run).
    """
    # Prepare the needle once in the representation the chosen mode compares.
    if mode == "quotes":
        needle = _normalize_for_match(old_text)
    elif mode == "icase":
        needle = _normalize_for_match(old_text.lower())
    else:
        needle = old_text

    found = []

    for element in t_re.finditer(xml_str):
        element_start = element.start()
        if _in_tracked_change(element_start, tc_ranges):
            continue

        decoded = _decode_xml_text(element.group(2))

        if mode == "exact":
            haystack = decoded
        elif mode == "quotes":
            haystack = _normalize_for_match(decoded)
        elif mode == "icase":
            haystack = _normalize_for_match(decoded.lower())
        else:
            continue

        offset = haystack.find(needle)
        if offset == -1:
            continue

        # The offset found in the comparison form is applied to the decoded
        # text with the length of the caller's original *old_text*.
        match_end = offset + len(old_text)
        actual_old = old_text if mode == "exact" else decoded[offset:match_end]

        try:
            run_start, run_end = _find_enclosing_run(xml_str, element_start)
        except ValueError:
            continue

        found.append({
            "actual_old": actual_old,
            "prefix": decoded[:offset],
            "suffix": decoded[match_end:],
            "run_span": (run_start, run_end),
        })

    return found
2070
2071def _find_enclosing_run(xml_str: str, inner_pos: int) -> tuple:
2072 """
2073 Return ``(start, end)`` byte offsets of the ``<w:r>...</w:r>`` element
2074 enclosing *inner_pos*.
2075 """
2076 pos = inner_pos
2077 while pos >= 0:
2078 pos = xml_str.rfind("<w:r", 0, pos)
2079 if pos == -1:
2080 raise ValueError("No enclosing <w:r> found")
2081
2082 char_after = xml_str[pos + 4 : pos + 5]
2083 if char_after in (">", " ", "\n", "\r", "\t"):
2084
2085 if "</w:r>" not in xml_str[pos:inner_pos]:
2086 break
2087
2088 end = xml_str.find("</w:r>", inner_pos)
2089 if end == -1:
2090 raise ValueError("No closing </w:r> found")
2091 end += len("</w:r>")
2092 return pos, end
2093
2094def _extract_rpr_str(run_xml: str) -> str:
2095 """Extract ``<w:rPr>...</w:rPr>`` from a serialised ``<w:r>`` string."""
2096
2097 m = re.search(r"<w:rPr\b[^>]*/>|<w:rPr\b[^>]*>.*?</w:rPr>", run_xml, re.DOTALL)
2098 return m.group(0) if m else ""
2099
2100def _find_enclosing_paragraph(xml_str: str, inner_pos: int) -> tuple:
2101 """Return ``(start, end)`` of the ``<w:p>...</w:p>`` enclosing *inner_pos*."""
2102 pos = inner_pos
2103 while pos >= 0:
2104 pos = xml_str.rfind("<w:p", 0, pos)
2105 if pos == -1:
2106 raise ValueError("No enclosing <w:p> found")
2107 char_after = xml_str[pos + 4 : pos + 5]
2108 if char_after in (">", " ", "\n", "\r", "\t"):
2109
2110 if "</w:p>" not in xml_str[pos:inner_pos]:
2111 break
2112 end = xml_str.find("</w:p>", inner_pos)
2113 if end == -1:
2114 raise ValueError("No closing </w:p> found")
2115 end += len("</w:p>")
2116 return pos, end
2117
2118def _extract_ppr_str(para_xml: str) -> str:
2119 """Extract ``<w:pPr>...</w:pPr>`` from a serialised ``<w:p>`` string."""
2120 m = re.search(r"<w:pPr\b[^>]*/>|<w:pPr\b[^>]*>.*?</w:pPr>", para_xml, re.DOTALL)
2121 return m.group(0) if m else ""
2122
2123def _next_revision_id(xml_str: str) -> int:
2124 """Return the next available ``w:id`` value (max existing + 1)."""
2125 ids = [int(x) for x in re.findall(r'w:id="(\d+)"', xml_str)]
2126 return max(ids, default=1000) + 1
2127
def _nearby_paragraph_text(xml_str: str, search_text: str) -> str:
    """Suggest up to three paragraphs that most resemble *search_text*.

    Each paragraph's ``<w:t>`` texts are joined with single spaces and
    compared case-insensitively with ``difflib.SequenceMatcher``.  The
    result is a ``" | "``-separated string of ``[similarity%] text`` entries
    (text truncated to 200 characters) so the LLM can see what is actually
    in the document and self-correct.  An empty string is returned when the
    best similarity is not above 30%.
    """
    from difflib import SequenceMatcher

    paragraph_re = re.compile(r"<w:p\b[^>]*>(.*?)</w:p>", re.DOTALL)
    text_re = re.compile(r"<w:t[^>]*>([^<]*)</w:t>", re.DOTALL)
    needle = search_text.lower()

    scored: list = []
    for paragraph in paragraph_re.finditer(xml_str):
        pieces = [
            _decode_xml_text(piece.group(1))
            for piece in text_re.finditer(paragraph.group(1))
        ]
        plain = " ".join(pieces).strip()
        if plain:
            similarity = SequenceMatcher(None, needle, plain.lower()).ratio()
            scored.append((similarity, plain[:200]))

    # Highest similarity first; ties keep document order (stable sort).
    best = sorted(scored, key=lambda entry: entry[0], reverse=True)[:3]
    if not best or best[0][0] <= 0.3:
        return ""
    return " | ".join(f"[{similarity:.0%}] {snippet}" for similarity, snippet in best)
2156
2157def _is_inside_del_str(container_xml: str, pos: int) -> bool:
2158 """Check if *pos* is inside a ``<w:del>`` block (string-based).
2159
2160 Uses ``"<w:del "`` (with trailing space) to avoid false positives from
2161 ``<w:delText>`` which also starts with ``<w:del``.
2162 """
2163 prefix = container_xml[:pos]
2164 return prefix.count("<w:del ") > prefix.count("</w:del>")
2165
def _extract_row_text(xml_str: str, pos: int) -> str:
    """Render the table row around *pos* as ``| cell1 | cell2 |``.

    The row is taken from the last ``<w:tr`` before *pos* to the first
    ``</w:tr>`` at or after it.  Each cell contributes the space-joined text
    of its ``<w:t>`` elements, leaving out any that sit inside a ``<w:del>``
    block.  Returns ``""`` when no row boundaries or no cells are found.
    """
    row_open = xml_str.rfind("<w:tr", 0, pos)
    if row_open == -1:
        return ""
    row_close = xml_str.find("</w:tr>", pos)
    if row_close == -1:
        return ""
    row_xml = xml_str[row_open : row_close + len("</w:tr>")]

    text_re = re.compile(r"<w:t[^>]*>([^<]*)</w:t>", re.DOTALL)
    cell_re = re.compile(r"<w:tc\b[^>]*>(.*?)</w:tc>", re.DOTALL)

    def _visible_text(cell_xml):
        # Text of one cell with deleted runs left out.
        kept = [
            _decode_xml_text(piece.group(1))
            for piece in text_re.finditer(cell_xml)
            if not _is_inside_del_str(cell_xml, piece.start())
        ]
        return " ".join(kept).strip()

    cells = [_visible_text(cell.group(1)) for cell in cell_re.finditer(row_xml)]
    if not cells:
        return ""
    return "| " + " | ".join(cells) + " |"
2190
def _extract_para_context(xml_str: str, pos: int) -> str:
    """Show the paragraph containing *pos* plus a hint of the one before it.

    The containing paragraph is rendered as ``>>> text`` (at most 200
    characters).  If an earlier non-empty paragraph exists, the last 80
    characters of the most recent one are placed on a preceding line,
    prefixed with ``...``.  Text inside ``<w:del>`` blocks is left out.
    Returns ``""`` when *pos* is not inside any paragraph.
    """
    text_re = re.compile(r"<w:t[^>]*>([^<]*)</w:t>", re.DOTALL)
    paragraph_re = re.compile(r"<w:p\b[^>]*>(.*?)</w:p>", re.DOTALL)

    preceding = ""
    for paragraph in paragraph_re.finditer(xml_str):
        inner = paragraph.group(1)
        kept = [
            _decode_xml_text(piece.group(1))
            for piece in text_re.finditer(inner)
            if not _is_inside_del_str(inner, piece.start())
        ]
        visible = " ".join(kept).strip()

        if paragraph.start() <= pos <= paragraph.end():
            lines = [f"...{preceding[-80:]}"] if preceding else []
            lines.append(f">>> {visible[:200]}")
            return "\n".join(lines)

        # Remember only paragraphs that actually carry text.
        if visible:
            preceding = visible
    return ""
2213
2214def _context_around_revision(xml_str: str, revision_ids: list) -> str:
2215 """Extract surrounding text near the first tracked change for verification.
2216
2217 Returns pipe-delimited row text for table edits, or ``>>> paragraph``
2218 for body edits. Best-effort — returns ``""`` on any failure.
2219 """
2220 if not revision_ids:
2221 return ""
2222 first_id = str(revision_ids[0])
2223 for pat in (f'w:id="{first_id}"', f"w:id='{first_id}'"):
2224 edit_pos = xml_str.find(pat)
2225 if edit_pos != -1:
2226 break
2227 else:
2228 return ""
2229
2230 prefix = xml_str[:edit_pos]
2231 in_cell = prefix.count("<w:tc") > prefix.count("</w:tc>")
2232
2233 if in_cell:
2234 return _extract_row_text(xml_str, edit_pos)
2235 return _extract_para_context(xml_str, edit_pos)
2236
2237_MAX_UNCOMPRESSED_SIZE = 50 * 1024 * 1024
2238
2239def _read_zip_entry(blob: bytes, entry_path: str) -> bytes:
2240 """Read a single file from a ZIP blob with zip-bomb protection."""
2241 with zipfile.ZipFile(io.BytesIO(blob), "r") as zf:
2242 info = zf.getinfo(entry_path)
2243 if info.file_size > _MAX_UNCOMPRESSED_SIZE:
2244 raise ValueError(
2245 f"ZIP entry '{entry_path}' uncompressed size "
2246 f"({info.file_size:,} bytes) exceeds limit"
2247 )
2248
2249 if info.compress_size > 0 and info.file_size / info.compress_size > 100:
2250 raise ValueError(
2251 f"ZIP entry '{entry_path}' has suspicious compression ratio "
2252 f"({info.file_size / info.compress_size:.0f}:1)"
2253 )
2254 return zf.read(entry_path)
2255
def _condense_xml(xml_str: str) -> str:
    """
    Remove pretty-print whitespace between XML elements for compact storage.

    The string is parsed, whitespace-only text nodes are dropped by
    :func:`_strip_whitespace_nodes` (which leaves text-bearing elements such
    as ``<w:t>`` and ``<w:delText>`` alone), and the tree is serialised
    again as UTF-8.

    This is best-effort: if parsing or serialising fails for any reason the
    input is returned unchanged, and the failure is logged so that it does
    not go unnoticed.
    """
    try:
        dom = safe_parseString(xml_str.encode("utf-8"))
        _strip_whitespace_nodes(dom.documentElement)
        return dom.toxml(encoding="UTF-8").decode("utf-8")
    except Exception:
        # Deliberately broad: condensing is an optimisation, never a reason
        # to fail the caller.  Record the cause and fall back to the input.
        logger.warning(
            "Could not condense XML; returning it unchanged", exc_info=True
        )
        return xml_str
2269
2270def _strip_whitespace_nodes(element):
2271 """Remove whitespace-only text nodes, except inside w:t / w:delText."""
2272 TEXT_TAGS = {"t", "delText", "instrText", "delInstrText"}
2273
2274 for child in list(element.childNodes):
2275 if child.nodeType == child.ELEMENT_NODE:
2276 name = child.localName or child.tagName
2277 tag = name.split(":")[-1] if ":" in name else name
2278 if tag not in TEXT_TAGS:
2279 _strip_whitespace_nodes(child)
2280 elif child.nodeType == child.TEXT_NODE:
2281
2282 parent_name = element.localName or element.tagName
2283 parent_tag = parent_name.split(":")[-1] if ":" in parent_name else parent_name
2284 if parent_tag not in TEXT_TAGS:
2285 if child.nodeValue and child.nodeValue.strip() == "":
2286 element.removeChild(child)
2287
2288def _find_elements(root, tag: str) -> list:
2289 """Find all elements matching *tag* (namespace-agnostic)."""
2290 results = []
2291
2292 def traverse(node):
2293 if node.nodeType == node.ELEMENT_NODE:
2294 name = node.localName or node.tagName
2295 if name == tag or name.endswith(f":{tag}"):
2296 results.append(node)
2297 for child in node.childNodes:
2298 traverse(child)
2299
2300 traverse(root)
2301 return results
2302
2303def _get_child(parent, tag: str):
2304 """Get first direct child element matching *tag*."""
2305 for child in parent.childNodes:
2306 if child.nodeType == child.ELEMENT_NODE:
2307 name = child.localName or child.tagName
2308 if name == tag or name.endswith(f":{tag}"):
2309 return child
2310 return None
2311
2312def _get_children(parent, tag: str) -> list:
2313 """Get all direct child elements matching *tag*."""
2314 results = []
2315 for child in parent.childNodes:
2316 if child.nodeType == child.ELEMENT_NODE:
2317 name = child.localName or child.tagName
2318 if name == tag or name.endswith(f":{tag}"):
2319 results.append(child)
2320 return results
2321
2322def _get_text_content(element) -> str:
2323 """Get the text content of an element (direct text node children)."""
2324 parts = []
2325 for child in element.childNodes:
2326 if child.nodeType == child.TEXT_NODE:
2327 parts.append(child.nodeValue or "")
2328 return "".join(parts)
2329
def _remove_elements(root, tag: str):
    """Detach every element named *tag* (as found by ``_find_elements``).

    Each match that still has a parent is removed from that parent; matches
    without a parent are left untouched.
    """
    for doomed in _find_elements(root, tag):
        owner = doomed.parentNode
        if not owner:
            continue
        owner.removeChild(doomed)
2335
def _strip_run_rsid_attrs(root):
    """Remove ``rsid*`` attributes from every run and its ``rPr`` child.

    Any attribute whose name contains ``rsid`` (case-insensitive) is dropped
    from each ``r`` element found under *root* and from that run's direct
    ``rPr`` child, if it has one.

    Run merging compares the serialised ``rPr`` of two runs, so leftover
    revision-save ids would keep otherwise identical runs apart.
    """

    def _drop_rsid(node):
        # Collect names first, then remove, so the attribute map is not
        # modified while it is being read.
        doomed = [
            attr.name
            for attr in node.attributes.values()
            if "rsid" in attr.name.lower()
        ]
        for attr_name in doomed:
            node.removeAttribute(attr_name)

    for run in _find_elements(root, "r"):
        _drop_rsid(run)
        properties = _get_child(run, "rPr")
        if properties is not None:
            _drop_rsid(properties)
2353
2354def _is_run(node) -> bool:
2355 name = node.localName or node.tagName
2356 return name == "r" or name.endswith(":r")
2357
2358def _is_adjacent(elem1, elem2) -> bool:
2359 """Check if two elements are adjacent (no elements between them)."""
2360 node = elem1.nextSibling
2361 while node:
2362 if node == elem2:
2363 return True
2364 if node.nodeType == node.ELEMENT_NODE:
2365 return False
2366 if node.nodeType == node.TEXT_NODE and node.data.strip():
2367 return False
2368 node = node.nextSibling
2369 return False
2370
def _can_merge(run1, run2) -> bool:
    """Decide whether two runs carry the same formatting.

    Runs are mergeable when neither has an ``rPr`` child, or when both have
    one and the two serialise to identical XML.
    """
    first_props = _get_child(run1, "rPr")
    second_props = _get_child(run2, "rPr")
    if first_props is None or second_props is None:
        # Mergeable only if the properties element is missing on both sides.
        return first_props is second_props
    return first_props.toxml() == second_props.toxml()
2380
2381def _next_element_sibling(node):
2382 sibling = node.nextSibling
2383 while sibling:
2384 if sibling.nodeType == sibling.ELEMENT_NODE:
2385 return sibling
2386 sibling = sibling.nextSibling
2387 return None
2388
def _next_sibling_run(node):
    """Return the nearest following sibling that is a run, else ``None``.

    Unlike ``_next_element_sibling`` this steps over non-run elements as
    well as non-element nodes.
    """
    candidate = node.nextSibling
    while candidate:
        is_element = candidate.nodeType == candidate.ELEMENT_NODE
        if is_element and _is_run(candidate):
            return candidate
        candidate = candidate.nextSibling
    return None
2397
def _first_child_run(container):
    """Return the first direct child of *container* that is a run, else ``None``."""
    return next(
        (
            node
            for node in container.childNodes
            if node.nodeType == node.ELEMENT_NODE and _is_run(node)
        ),
        None,
    )
2403
2404def _merge_run_content(target, source):
2405 """Move content from *source* run into *target* (skip rPr)."""
2406 for child in list(source.childNodes):
2407 if child.nodeType == child.ELEMENT_NODE:
2408 name = child.localName or child.tagName
2409 if name != "rPr" and not name.endswith(":rPr"):
2410 target.appendChild(child)
2411
def _consolidate_text(run):
    """Fold neighbouring ``<w:t>`` children of *run* into a single element.

    Pairs are processed from right to left, so text folded into an element
    is carried further left when that element is itself merged.  For each
    adjacent pair the right-hand text is appended to the left-hand element
    and the right-hand element is removed.  The surviving element gets
    ``xml:space="preserve"`` when the merged text starts or ends with a
    space; otherwise any existing ``xml:space`` attribute is dropped.
    """
    text_elements = _get_children(run, "t")

    for left, right in reversed(list(zip(text_elements, text_elements[1:]))):
        if not _is_adjacent(left, right):
            continue

        left_text = left.firstChild.data if left.firstChild else ""
        right_text = right.firstChild.data if right.firstChild else ""
        combined = left_text + right_text

        if left.firstChild:
            left.firstChild.data = combined
        else:
            left.appendChild(run.ownerDocument.createTextNode(combined))

        # Without xml:space="preserve" edge spaces would not survive.
        if " " in (combined[:1], combined[-1:]):
            left.setAttribute("xml:space", "preserve")
        elif left.hasAttribute("xml:space"):
            left.removeAttribute("xml:space")

        run.removeChild(right)
2435
def _merge_runs_in(container) -> int:
    """Collapse consecutive same-format runs inside *container*.

    Starting from the first child run, each run absorbs the element that
    immediately follows it for as long as that element is a run that
    ``_can_merge`` accepts; the absorbed run is removed from *container*.
    The run's text elements are then consolidated and processing moves on
    to the next sibling run.

    Returns the number of runs that were absorbed.
    """
    absorbed = 0
    current = _first_child_run(container)

    while current:
        follower = _next_element_sibling(current)
        while follower and _is_run(follower) and _can_merge(current, follower):
            _merge_run_content(current, follower)
            container.removeChild(follower)
            absorbed += 1
            follower = _next_element_sibling(current)

        _consolidate_text(current)
        current = _next_sibling_run(current)

    return absorbed
2455