Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,22 @@ async def test_issue_42():
assert doc.rawContent.endswith(doc.retweetedTweet.rawContent)


def test_retweet_not_duplicated():
    """Ensure a retweet's embedded original is not also a top-level result.

    Parses the "_issue_42" fixture with parse_tweets, locates the retweet
    wrapper by its known id, and verifies that the id of the tweet it wraps
    does not appear among the parsed top-level tweets.
    """
    payload = fake_rep("_issue_42").json()
    tweets = list(parse_tweets(payload))

    # Find the first parsed tweet carrying the known wrapper id, if any.
    rt_wrapper = None
    for candidate in tweets:
        if candidate.id == 1665951747842641921:
            rt_wrapper = candidate
            break

    assert rt_wrapper is not None, "RT wrapper tweet not found"
    assert rt_wrapper.retweetedTweet is not None

    original_id = rt_wrapper.retweetedTweet.id
    leaked = any(t.id == original_id for t in tweets)
    assert not leaked, (
        f"retweetedTweet {original_id} leaked as a standalone top-level item"
    )


async def test_issue_56():
raw = fake_rep("_issue_56").json()
doc = parse_tweet(raw, 1682072224013099008)
Expand All @@ -552,6 +568,24 @@ async def test_issue_56():
assert len(doc.links) == 5


async def test_issue_310():
    """Check that user_tweets never yields a retweeted original on its own.

    With api.user_tweets_raw mocked via mock_rep (fixture name
    "raw_user_tweets"), collects the tweets for one user id and asserts
    that at least one retweet is present and that no id embedded as a
    retweetedTweet also shows up as a top-level tweet id.
    """
    api = get_api()
    mock_rep(api.user_tweets_raw, "raw_user_tweets", as_generator=True)

    tweets = await gather(api.user_tweets(2244994945))

    # Collect top-level ids and embedded retweet ids in a single pass.
    top_level_ids = set()
    retweeted_ids = set()
    for tweet in tweets:
        top_level_ids.add(tweet.id)
        if tweet.retweetedTweet is not None:
            retweeted_ids.add(tweet.retweetedTweet.id)

    leaked_ids = top_level_ids.intersection(retweeted_ids)

    # The fixture must contain retweets, otherwise the check below is vacuous.
    assert retweeted_ids
    assert not leaked_ids, (
        f"top_level={len(top_level_ids)}, "
        f"retweets={sum(x.retweetedTweet is not None for x in tweets)}, "
        f"retweeted_children={len(retweeted_ids)}, "
        f"leaked={len(leaked_ids)}"
    )


async def test_cards():
# Issues:
# - https://github.com/vladkens/twscrape/issues/72
Expand Down
3 changes: 3 additions & 0 deletions twscrape/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -822,9 +822,12 @@ def _parse_items(rep: Response, kind: str, limit: int = -1):
# check for dict, because Response can be mocked in tests with different type
res = rep if isinstance(rep, dict) else rep.json()
obj = to_old_rep(res)
retweeted_ids: set[str] = obj.get("retweeted_ids", set())

ids = set()
for x in obj[key].values():
if kind == "tweet" and x.get("id_str") in retweeted_ids:
continue
if limit != -1 and len(ids) >= limit:
# todo: move somewhere in configuration like force_limit
# https://github.com/vladkens/twscrape/issues/26#issuecomment-1656875132
Expand Down
16 changes: 14 additions & 2 deletions twscrape/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ def to_old_obj(obj: dict):
return _flatten_tweet_v2(obj)


def to_old_rep(obj: dict) -> dict[str, dict]:
def to_old_rep(obj: dict) -> dict[str, Any]:
tmp = get_typed_object(obj, defaultdict(list))

# "legacy" in x still matches under the new schema: the key is present
Expand Down Expand Up @@ -291,7 +291,19 @@ def _to_old_user(obj: dict) -> dict | None:
trends = list(tmp.get("TimelineTrend", []))
trends = {x["name"]: x for x in trends}

return {"tweets": {**tw1, **tw2}, "users": users, "trends": trends}
tweets = {**tw1, **tw2}
retweeted_ids = {
str(retweeted_id)
for tweet in tweets.values()
for path in (
"retweeted_status_id_str",
"retweeted_status_result.result.rest_id",
"retweeted_status_result.result.tweet.rest_id",
)
if (retweeted_id := get_or(tweet, path)) is not None
}

return {"tweets": tweets, "retweeted_ids": retweeted_ids, "users": users, "trends": trends}


def print_table(rows: list[dict], hr_after=False):
Expand Down