diff --git a/src/timeline-v1.ts b/src/timeline-v1.ts
index 629523d0..85e5a8d7 100644
--- a/src/timeline-v1.ts
+++ b/src/timeline-v1.ts
@@ -86,6 +86,7 @@ export interface SearchResultRaw {
     result?: SearchResultRaw;
   };
   legacy?: LegacyTweetRaw;
+  article?: ArticleRaw;
 }
 
 export interface TimelineResultRaw {
@@ -119,6 +120,88 @@ export interface TimelineResultRaw {
   };
   legacy?: LegacyTweetRaw;
   tweet?: TimelineResultRaw;
+  article?: ArticleRaw;
+}
+
+/** Raw `article` field of a tweet result; wraps the article payload. */
+export interface ArticleRaw {
+  article_results: {
+    result: ArticleResultRaw;
+  };
+}
+
+/** Raw article payload: id, title, cover, body and referenced media. */
+export interface ArticleResultRaw {
+  rest_id: string;
+  title: string;
+  cover_media?: ArticleCoverMediaRaw;
+  content_state: ArticleContentStateRaw;
+  media_entities?: ArticleMediaEntityRaw[];
+}
+
+/** Raw reference to an article's cover image. */
+export interface ArticleCoverMediaRaw {
+  media_key: string;
+  media_info: {
+    original_img_url: string;
+  };
+}
+
+/** Raw article body: text blocks plus the entities their ranges refer to. */
+export interface ArticleContentStateRaw {
+  blocks: ArticleBlockRaw[];
+  entityMap: ArticleEntityRaw[];
+}
+
+/** One block of article text with its inline style and entity ranges. */
+export interface ArticleBlockRaw {
+  key: string;
+  text: string;
+  type: string;
+  inlineStyleRanges: {
+    offset: number;
+    length: number;
+    style: string;
+  }[];
+  entityRanges: {
+    key: number;
+    offset: number;
+    length: number;
+  }[];
+}
+
+/** Media item referenced from a `MEDIA` entity. */
+export interface ArticleEntityValueMediaItemRaw {
+  localMediaId: string;
+  mediaCategory: string;
+  mediaId: string;
+}
+
+/** Entity payload: `url` is read for `LINK`, `mediaItems` for `MEDIA`. */
+export interface ArticleEntityValueRaw {
+  type: string;
+  mutability?: string;
+  data: {
+    url?: string;
+    entityKey?: string;
+    mediaItems?: ArticleEntityValueMediaItemRaw[];
+  };
+}
+
+/** Entity map entry; block entity ranges refer to it by `key`. */
+export interface ArticleEntityRaw {
+  key: number;
+  value: ArticleEntityValueRaw;
+}
+
+/** Media attached to an article, looked up by `media_id` or `media_key`. */
+export interface ArticleMediaEntityRaw {
+  media_key: string;
+  media_id: string;
+  media_info: {
+    __typename: 'ApiImage' | 'ApiGif' | 'ApiVideo';
+    original_img_url: string;
+  };
 }
 
 export interface LegacyTweetRaw {
diff --git a/src/timeline-v2.ts b/src/timeline-v2.ts
index ecf12fe0..08fb8bbb 100644
--- a/src/timeline-v2.ts
+++ b/src/timeline-v2.ts
@@ -1,6 +1,10 @@
 import { CoreUserRaw, LegacyUserRaw } from './profile';
 import { parseMediaGroups, reconstructTweetHtml } from './timeline-tweet-util';
 import {
+  ArticleBlockRaw,
+  ArticleEntityValueMediaItemRaw,
+  ArticleEntityValueRaw,
+  ArticleResultRaw,
   EditControlInitialRaw,
   LegacyTweetRaw,
   ParseTweetResult,
@@ -8,7 +12,7 @@ import {
   SearchResultRaw,
   TimelineResultRaw,
 } from './timeline-v1';
-import { Tweet } from './tweets';
+import { Article, Tweet } from './tweets';
 import { isFieldDefined } from './type-util';
 
 export interface TimelineUserResultRaw {
@@ -257,6 +261,177 @@ export function parseLegacyTweet(
   return { success: true, tweet: tw };
 }
 
+/**
+ * Renders one article block's text as Markdown, applying its link entities
+ * and bold/italic style ranges.
+ *
+ * Every range offset refers to the block's original text, so all marker
+ * positions are collected first and spliced in during a single pass. Rewriting
+ * the text once per range would shift the offsets of the remaining ranges.
+ *
+ * @param block - Block whose text, style ranges and entity ranges are rendered.
+ * @param entities - Entity values indexed by their stringified key.
+ * @returns The block text with Markdown markers inserted.
+ */
+function renderArticleBlockText(
+  block: Readonly<ArticleBlockRaw>,
+  entities: ReadonlyMap<string, ArticleEntityValueRaw>,
+): string {
+  // Offsets are treated as code-point indices, hence the character array.
+  const chars = Array.from(block.text);
+  const spans: { start: number; end: number; open: string; close: string }[] =
+    [];
+
+  const addSpan = (
+    offset: number,
+    length: number,
+    open: string,
+    close: string,
+  ): void => {
+    const start = Math.min(Math.max(offset, 0), chars.length);
+    let end = Math.min(Math.max(offset + length, start), chars.length);
+    // Keep a trailing newline outside of the markers.
+    if (end > start && chars[end - 1] === '\n') end -= 1;
+    // Empty ranges would only produce stray markers such as `****`.
+    if (end > start) spans.push({ start, end, open, close });
+  };
+
+  for (const range of block.inlineStyleRanges) {
+    const style = range.style.toLowerCase();
+    if (style === 'bold') {
+      addSpan(range.offset, range.length, '**', '**');
+    } else if (style === 'italic') {
+      addSpan(range.offset, range.length, '*', '*');
+    }
+  }
+
+  for (const range of block.entityRanges) {
+    const entity = entities.get(String(range.key));
+    if (entity?.type === 'LINK' && entity.data.url) {
+      addSpan(range.offset, range.length, '[', `](${entity.data.url})`);
+    }
+  }
+
+  // Longer spans go first so that nested markers open outermost-first and
+  // close innermost-first.
+  spans.sort((a, b) => b.end - b.start - (a.end - a.start));
+
+  const opens = new Map<number, string>();
+  const closes = new Map<number, string>();
+  for (const span of spans) {
+    opens.set(span.start, (opens.get(span.start) ?? '') + span.open);
+    closes.set(span.end, span.close + (closes.get(span.end) ?? ''));
+  }
+
+  let rendered = '';
+  for (let i = 0; i <= chars.length; i++) {
+    const markers = (closes.get(i) ?? '') + (opens.get(i) ?? '');
+    rendered += markers + (chars[i] ?? '');
+  }
+  return rendered;
+}
+
+/**
+ * Renders a raw article as a Markdown document.
+ *
+ * The title becomes a top-level heading, followed by one entry per content
+ * block: `header-one` and `header-two` map to `#` and `##` headings,
+ * `unordered-list-item` to a `*` bullet, and `atomic` blocks to one image per
+ * media item that resolves in `media_entities`. Any other block type is
+ * emitted as a plain paragraph.
+ *
+ * @param article - Raw article payload including its content state.
+ * @returns The Markdown text with surrounding whitespace trimmed.
+ */
+function parseArticleToMarkdown(article: Readonly<ArticleResultRaw>): string {
+  const { blocks, entityMap } = article.content_state;
+
+  // Index entities once; the first entry wins if a key is repeated.
+  const entities = new Map<string, ArticleEntityValueRaw>();
+  for (const { key, value } of entityMap) {
+    const id = String(key);
+    if (!entities.has(id)) entities.set(id, value);
+  }
+
+  let markdown = `# ${article.title}\n\n`;
+  let inList = false;
+
+  for (const block of blocks) {
+    const text = renderArticleBlockText(block, entities);
+
+    // A blank line after the last bullet keeps the next block from being
+    // parsed as a continuation of the list.
+    const isListItem = block.type === 'unordered-list-item';
+    if (inList && !isListItem) markdown += '\n';
+    inList = isListItem;
+
+    switch (block.type) {
+      case 'header-one':
+        markdown += `# ${text}\n\n`;
+        break;
+      case 'header-two':
+        markdown += `## ${text}\n\n`;
+        break;
+      case 'unordered-list-item':
+        markdown += `* ${text}\n`;
+        break;
+      case 'atomic':
+        for (const range of block.entityRanges) {
+          const entity = entities.get(String(range.key));
+          if (entity?.type !== 'MEDIA') continue;
+          for (const mediaItem of entity.data.mediaItems ?? []) {
+            if (!mediaItem?.mediaId) continue;
+            const mediaEntity = article.media_entities?.find(
+              (m) => m.media_id === mediaItem.mediaId,
+            );
+            if (mediaEntity) {
+              markdown += `![image](${mediaEntity.media_info.original_img_url})\n\n`;
+            }
+          }
+        }
+        break;
+      case 'unstyled':
+      default:
+        markdown += `${text}\n\n`;
+        break;
+    }
+  }
+
+  return markdown.trim();
+}
+
+/**
+ * Builds the public `Article` from a raw article payload.
+ *
+ * The cover is only set when `cover_media` is present and its `media_key`
+ * matches an entry in `media_entities`.
+ *
+ * @param articleRaw - Raw article payload.
+ * @returns The article with id, title, content state and optional cover.
+ */
+function parseArticle(articleRaw: Readonly<ArticleResultRaw>): Article {
+  const article: Article = {
+    id: articleRaw.rest_id,
+    title: articleRaw.title,
+    content_state: articleRaw.content_state,
+  };
+
+  if (articleRaw.cover_media) {
+    const coverMedia = articleRaw.media_entities?.find(
+      (m) => m.media_key === articleRaw.cover_media?.media_key,
+    );
+    if (coverMedia) {
+      article.cover = {
+        id: coverMedia.media_id,
+        url: coverMedia.media_info.original_img_url,
+        alt_text: undefined, // not available
+      };
+    }
+  }
+
+  return article;
+}
+
 function parseResult(result?: TimelineResultRaw): ParseTweetResult {
   const noteTweetResultText =
     result?.note_tweet?.note_tweet_results?.result?.text;
@@ -282,6 +457,17 @@ function parseResult(result?: TimelineResultRaw): ParseTweetResult {
     }
   }
 
+  // For article tweets the body lives in the article payload, so the rendered
+  // Markdown replaces the tweet text.
+  const articleRaw = result?.article?.article_results?.result;
+  if (articleRaw) {
+    tweetResult.tweet.isArticle = true;
+    if (articleRaw.content_state) {
+      tweetResult.tweet.article = parseArticle(articleRaw);
+      tweetResult.tweet.text = parseArticleToMarkdown(articleRaw);
+    }
+  }
+
   const quotedResult = result?.quoted_status_result?.result;
   if (quotedResult) {
     if (quotedResult.legacy && quotedResult.rest_id) {
diff --git a/src/tweets.test.ts b/src/tweets.test.ts
index b8e638ab..8ec3cf17 100644
--- a/src/tweets.test.ts
+++ b/src/tweets.test.ts
@@ -378,3 +378,33 @@ test('scraper can get animated image as video', async () => {
     url: expectedURL,
   });
 });
+
+test('scraper marks article tweets and exposes article metadata', async () => {
+  // X Article tweet — `legacy.full_text` is just the t.co URL stub; the
+  // body lives in `article.article_results.result`. Without article
+  // parsing the lib used to return `text` as the bare URL and lose the
+  // body entirely.
+  const scraper = await getScraper();
+  const tweet = await scraper.getTweet('2053808119709659225');
+
+  expect(tweet).not.toBeNull();
+  expect(tweet?.isArticle).toBe(true);
+  expect(tweet?.article).toBeDefined();
+  expect(tweet?.article?.id).toBe('2051886859186532352');
+  expect(tweet?.article?.title).toContain('Research Layer');
+  // content_state is the source-of-truth payload we render markdown from.
+  expect(tweet?.article?.content_state?.blocks?.length ?? 0).toBeGreaterThan(
+    10,
+  );
+});
+
+test('scraper renders article body into tweet.text as markdown', async () => {
+  const scraper = await getScraper();
+  const tweet = await scraper.getTweet('2053808119709659225');
+
+  expect(tweet?.text).toBeDefined();
+  // Far longer than the bare-URL stub the lib used to return (~23 chars);
+  // the title is an H1 followed by a real blank line, not a literal "\n".
+  expect((tweet?.text ?? '').length).toBeGreaterThan(1000);
+  expect(tweet?.text).toMatch(/^# .+\n\n/);
+});
diff --git a/src/tweets.ts b/src/tweets.ts
index c427d146..e0131894 100644
--- a/src/tweets.ts
+++ b/src/tweets.ts
@@ -1,7 +1,11 @@
 import { addApiFeatures, requestApi, bearerToken2 } from './api';
 import { TwitterAuth } from './auth';
 import { getUserIdByScreenName } from './profile';
-import { LegacyTweetRaw, QueryTweetsResponse } from './timeline-v1';
+import {
+  ArticleContentStateRaw,
+  LegacyTweetRaw,
+  QueryTweetsResponse,
+} from './timeline-v1';
 import {
   parseTimelineTweetsV2,
   TimelineV2,
@@ -33,6 +37,17 @@ export interface Video {
   url?: string;
 }
 
+/**
+ * Article attached to a tweet. `content_state` is the raw body that the
+ * tweet's Markdown `text` is rendered from.
+ */
+export interface Article {
+  id: string;
+  title: string;
+  cover?: Photo;
+  content_state: ArticleContentStateRaw;
+}
+
 export interface PlaceRaw {
   id?: string;
   place_type?: string;
@@ -65,6 +80,10 @@ export interface Tweet {
   isReply?: boolean;
   isRetweet?: boolean;
   isSelfThread?: boolean;
+  /** True when the raw result carries an article payload. */
+  isArticle?: boolean;
+  /** Parsed article; set only when the payload includes a content state. */
+  article?: Article;
   likes?: number;
   name?: string;
   mentions: Mention[];