diff --git a/.changeset/fix-html-body-content-id.md b/.changeset/fix-html-body-content-id.md new file mode 100644 index 00000000..54ef2c08 --- /dev/null +++ b/.changeset/fix-html-body-content-id.md @@ -0,0 +1,11 @@ +--- +"@googleworkspace/cli": patch +--- + +fix(gmail): preserve HTML body when text/html part has Content-ID + +Outlook/Exchange adds a Content-ID header to the text/html body part for +multipart/related referencing. The MIME walker incorrectly treated any part +with Content-ID as a non-body part, causing the HTML body to be silently +dropped. Replies to Outlook messages fell back to a plain-text conversion, +losing all formatting, nested blockquotes, and inline images. diff --git a/crates/google-workspace-cli/src/helpers/gmail/mod.rs b/crates/google-workspace-cli/src/helpers/gmail/mod.rs index caeb8b6b..9aef2ba7 100644 --- a/crates/google-workspace-cli/src/helpers/gmail/mod.rs +++ b/crates/google-workspace-cli/src/helpers/gmail/mod.rs @@ -919,9 +919,15 @@ fn extract_payload_recursive( // Primary signal: does this part have fetchable binary data? let is_hydratable = !attachment_id.is_empty(); - // A body text part has inline body.data, no attachmentId, no filename, and no Content-ID. + // A body text part is text/plain or text/html with inline body.data, no + // attachmentId, and no filename. + // Note: Content-ID is NOT checked here. Outlook/Exchange adds Content-ID to + // the text/html body part for multipart/related referencing — excluding parts + // with Content-ID would cause the HTML body to be silently dropped, falling + // back to a plain-text conversion that loses all formatting and nested quotes. 
+ let is_text_mime = mime_type == "text/plain" || mime_type == "text/html"; let is_body_text_part = - !is_hydratable && filename.is_empty() && content_id_header.is_none() && body_data.is_some(); + !is_hydratable && filename.is_empty() && is_text_mime && body_data.is_some(); if is_body_text_part { // body_data is guaranteed Some by the is_body_text_part check above. @@ -985,6 +991,28 @@ fn extract_payload_recursive( for child in child_parts { extract_payload_recursive(child, contents, part_counter); } + } else if body_data.is_some() && !mime_type.starts_with("multipart/") { + // Non-body-text, non-hydratable leaf with inline data that we're about + // to drop. Log so silent loss is at least debuggable. + let mime_label = if mime_type.is_empty() { + "" + } else { + mime_type + }; + eprintln!( + "Warning: skipping inline {} part ({}{})", + sanitize_for_terminal(mime_label), + if filename.is_empty() { + "no filename".to_string() + } else { + format!("filename: {}", sanitize_for_terminal(filename)) + }, + if body_size > 0 { + format!(", {} bytes", body_size) + } else { + String::new() + }, + ); } } } @@ -3738,6 +3766,36 @@ mod tests { ); } + #[test] + fn test_extract_payload_contents_plain_text_with_content_id() { + // Some mail generators add Content-ID to text/plain parts (seen in + // forwarded messages and certain enterprise gateways). The walker must + // still recognize these as body text. 
+ let text_data = base64url("Plain text body with Content-ID"); + let payload = json!({ + "mimeType": "multipart/related", + "parts": [ + { + "mimeType": "text/plain", + "body": { "data": text_data, "size": 31 }, + "headers": [ + { "name": "Content-Type", "value": "text/plain; charset=\"utf-8\"" }, + { "name": "Content-ID", "value": "" } + ] + } + ] + }); + let contents = extract_payload_contents(&payload); + assert!( + contents.body_text.is_some(), + "text/plain with Content-ID must not be skipped" + ); + assert_eq!( + contents.body_text.as_deref(), + Some("Plain text body with Content-ID") + ); + } + #[test] fn test_header_case_insensitive() { let payload = json!({ @@ -3902,6 +3960,126 @@ mod tests { ); } + #[test] + fn test_parse_original_message_html_with_content_id_end_to_end() { + // End-to-end regression test: Outlook/Exchange adds Content-ID to text/html + // body parts for multipart/related referencing. The HTML must survive through + // parse_original_message and resolve_html_body, not fall back to
<br>-ified + // plain text. + let plain_data = base64url("Plain text version"); + let html_data = base64url( + "

Rich HTML

Nested quote
", + ); + let msg = json!({ + "threadId": "thread1", + "snippet": "Rich HTML", + "payload": { + "mimeType": "multipart/related", + "headers": [ + { "name": "From", "value": "sender@example.com" }, + { "name": "To", "value": "recipient@example.com" }, + { "name": "Subject", "value": "Re: Meeting followup" }, + { "name": "Message-ID", "value": "" }, + ], + "parts": [ + { + "mimeType": "multipart/alternative", + "parts": [ + { + "mimeType": "text/plain", + "body": { "data": plain_data, "size": 18 }, + "headers": [ + { "name": "Content-Type", "value": "text/plain; charset=\"utf-8\"" } + ] + }, + { + "mimeType": "text/html", + "body": { "data": html_data, "size": 78 }, + "headers": [ + { "name": "Content-Type", "value": "text/html; charset=\"utf-8\"" }, + { "name": "Content-ID", "value": "" } + ] + } + ] + }, + { + "mimeType": "image/png", + "filename": "", + "body": { "attachmentId": "SIG_IMG", "size": 500 }, + "headers": [ + { "name": "Content-ID", "value": "" } + ] + } + ] + } + }); + let original = parse_original_message(&msg).unwrap(); + // body_html must be the actual HTML, not None + assert!( + original.body_html.is_some(), + "HTML body with Content-ID must be preserved through parse_original_message" + ); + // resolve_html_body must return the HTML, not a
<br>-converted plain text fallback + let resolved = resolve_html_body(&original); + assert!( + resolved.contains("blockquote"), + "resolve_html_body must use the HTML body, not plain-text fallback" + ); + assert!( + !resolved.contains("<br>"), + "resolve_html_body must not fall back to <br>-converted plain text" + ); + // Inline signature image must still be collected as a part + assert_eq!(original.parts.len(), 1); + assert_eq!(original.parts[0].attachment_id, "SIG_IMG"); + } + + #[test] + fn test_extract_payload_contents_multiple_html_leaves_first_wins() { + // When multiple text/html parts are eligible (e.g. one with Content-ID, + // one without), the walker takes the first one encountered in DFS order. + // This documents current behavior — it is a heuristic, not a guarantee + // of multipart/related root-part semantics (which would require honoring + // the start= parameter). + let first_html = base64url("

First HTML (with Content-ID)

"); + let second_html = base64url("

Second HTML (no Content-ID)

"); + let payload = json!({ + "mimeType": "multipart/related", + "parts": [ + { + "mimeType": "multipart/alternative", + "parts": [ + { + "mimeType": "text/html", + "body": { "data": first_html, "size": 34 }, + "headers": [ + { "name": "Content-ID", "value": "" } + ] + }, + { + "mimeType": "text/html", + "body": { "data": second_html, "size": 37 }, + "headers": [] + } + ] + } + ] + }); + let contents = extract_payload_contents(&payload); + assert!( + contents.body_html.is_some(), + "at least one text/html part should be recognized as body" + ); + assert!( + contents + .body_html + .as_deref() + .unwrap() + .contains("First HTML"), + "First eligible text/html in DFS order should win" + ); + } + // --- finalize_message with multiple inline images --- #[test]