Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,16 @@ dependencies = [
"beautifulsoup4>=4.13.0",
]

[project.optional-dependencies]
curl = ["curl-cffi>=0.7.0"]

[dependency-groups]
dev = [
"curl-cffi>=0.7.0",
"httpx>=0.26.0",
"pyright>=1.1.369",
"pytest-asyncio>=0.23.3",
"pytest-cov>=4.1.0",
"pytest-httpx>=0.28.0",
"pytest>=7.4.4",
"ruff>=0.1.11",
]
Expand Down Expand Up @@ -66,5 +70,5 @@ select = ["E", "F", "I", "UP", "C4", "SIM"]
ignore = ["E501", "UP035", "SIM105"]

[tool.pyright]
include = ["apps", "clients", "strategy", "lib", "scripts"]
include = ["twscrape"]
typeCheckingMode = "standard"
13 changes: 9 additions & 4 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,15 @@ Twitter GraphQL API implementation with [SNScrape](https://github.com/JustAnothe
```bash
pip install twscrape
```
Or development version:

`httpx` is included by default. For better Cloudflare/bot-detection bypass, install `curl-cffi` as well — it uses libcurl with browser-level TLS fingerprint spoofing and is preferred automatically when present:

```bash
pip install git+https://github.com/vladkens/twscrape.git
pip install twscrape[curl]
```

Override the backend explicitly with `TWS_HTTP_BACKEND=httpx` or `TWS_HTTP_BACKEND=curl`.

## Features
- Supports both the Search and GraphQL Twitter APIs
- Async/Await functions (can run multiple scrapers in parallel at the same time)
Expand Down Expand Up @@ -122,9 +126,9 @@ async def main():
async for tweet in api.search("elon musk"):
print(tweet.id, tweet.user.username, tweet.rawContent) # tweet is `Tweet` object

# NOTE 2: all methods have `raw` version (returns `httpx.Response` object):
# NOTE 2: all methods have `raw` version (returns `twscrape.Response` object):
async for rep in api.search_raw("elon musk"):
print(rep.status_code, rep.json()) # rep is `httpx.Response` object
print(rep.status_code, rep.json()) # rep is `twscrape.Response` object

# change log level, default info
set_log_level("DEBUG")
Expand Down Expand Up @@ -360,6 +364,7 @@ _Note:_ If proxy not working, exception will be raised from API class.
- `TWS_PROXY` - global proxy for all accounts (e.g. `socks5://user:pass@127.0.0.1:1080`)
- `TWS_WAIT_EMAIL_CODE` - timeout for email verification code during login (default: `30`, in seconds)
- `TWS_RAISE_WHEN_NO_ACCOUNT` - raise `NoAccountError` exception when no available accounts, instead of waiting (default: `false`, values: `false`/`0`/`true`/`1`)
- `TWS_HTTP_BACKEND` - force HTTP backend: `httpx` or `curl` (default: `curl` if installed, otherwise `httpx`)

## Limitations

Expand Down
10 changes: 5 additions & 5 deletions scripts/update_gql_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
import os
import re
import sys
from typing import Any

import httpx

from twscrape.http import make_client
from twscrape.xclid import get_scripts_list, get_tw_page_text, script_url

API_FILE = "twscrape/api.py"
Expand All @@ -28,7 +28,7 @@ def _is_relevant_script(url: str) -> bool:
async def get_scripts() -> list[tuple[str, str]]:
os.makedirs(CACHE_DIR, exist_ok=True)

async with httpx.AsyncClient(follow_redirects=True) as clt:
async with make_client() as clt:
text = await get_tw_page_text("https://x.com/elonmusk", clt)

urls = list(get_scripts_list(text))
Expand All @@ -50,7 +50,7 @@ async def fetch_scripts(scripts: list[tuple[str, str]], force: bool) -> None:
print(f"Downloading {len(todo)} scripts.")
sem = asyncio.Semaphore(10)

async def fetch(clt: httpx.AsyncClient, i: int, url: str, path: str) -> None:
async def fetch(clt: Any, i: int, url: str, path: str) -> None:
async with sem:
print(f" ({i:3d}/{len(todo):3d}) {url}")
rep = await clt.get(url)
Expand All @@ -61,7 +61,7 @@ async def fetch(clt: httpx.AsyncClient, i: int, url: str, path: str) -> None:
with open(path, "w", encoding="utf-8") as fp:
fp.write(rep.text)

async with httpx.AsyncClient(follow_redirects=True) as clt:
async with make_client() as clt:
await asyncio.gather(*[fetch(clt, i, url, path) for i, (url, path) in enumerate(todo, 1)])


Expand Down
11 changes: 8 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import pytest

from twscrape.account import Account
from twscrape.accounts_pool import AccountsPool
from twscrape.api import API
from twscrape.logger import set_log_level
from twscrape.queue_client import QueueClient, XClIdGenStore

set_log_level("ERROR")
from .mock_http import MockClient

set_log_level("CRITICAL")


class ClIdGenMock:
Expand All @@ -28,15 +31,17 @@ def pool_mock(tmp_path):


@pytest.fixture
async def client_fixture(pool_mock: AccountsPool):
async def client_fixture(pool_mock: AccountsPool, monkeypatch):
mock_clt = MockClient()
monkeypatch.setattr(Account, "make_client", lambda self, proxy=None: mock_clt)
pool_mock._order_by = "username"

for x in range(1, 3):
await pool_mock.add_account(f"user{x}", f"pass{x}", f"email{x}", f"email_pass{x}")
await pool_mock.set_active(f"user{x}", True)

client = QueueClient(pool_mock, "SearchTimeline")
yield pool_mock, client
yield pool_mock, client, mock_clt


@pytest.fixture
Expand Down
77 changes: 77 additions & 0 deletions tests/mock_http.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import json as _json
from unittest.mock import MagicMock

from twscrape.http import HttpClient, HttpMethod, Response


def _raw(*, status_code: int = 200, json_data=None, text: str = "", headers: dict | None = None):
raw = MagicMock()
raw.status_code = status_code
raw.text = text
raw.content = text.encode()
raw.headers = headers or {}
raw.url = "https://mock.local"
raw.request = MagicMock()
raw.request.method = "GET"
raw.request.url = "https://mock.local"
raw.json.return_value = json_data if json_data is not None else {}
if status_code >= 400:
raw.raise_for_status.side_effect = Exception(f"HTTP {status_code}")
else:
raw.raise_for_status.return_value = None
return raw


class MockClient(HttpClient):
    """Scripted ``HttpClient`` replacement for tests.

    Outcomes are registered up front with the ``add_*`` methods and handed out
    one per ``request()`` call, in the order they were added. Each ``add_*``
    method returns the client itself so registrations can be chained.
    """

    def __init__(self):
        self._headers: dict = {}
        self._cookies: dict = {}
        # FIFO of tagged tuples; the first element selects how request() replays it.
        self._queue: list = []

    def add_response(
        self,
        *,
        status_code: int = 200,
        json: dict | list | None = None,
        text: str = "",
        headers: dict | None = None,
    ) -> "MockClient":
        """Queue a regular response with the given status, JSON payload, text and headers."""
        entry = ("response", status_code, json, text, headers)
        self._queue.append(entry)
        return self

    def add_exception(self, exc: Exception) -> "MockClient":
        """Queue ``exc`` to be raised by the corresponding ``request()`` call."""
        entry = ("exc", exc)
        self._queue.append(entry)
        return self

    def add_invalid_json_response(
        self, *, status_code: int = 200, text: str = "not-json", headers: dict | None = None
    ) -> "MockClient":
        """Queue a response whose ``json()`` raises ``JSONDecodeError``."""
        entry = ("invalid_json", status_code, text, headers)
        self._queue.append(entry)
        return self

    @property
    def cookies(self):
        """Mutable cookie mapping held by this mock."""
        return self._cookies

    @property
    def headers(self):
        """Mutable header mapping held by this mock."""
        return self._headers

    async def request(self, method: HttpMethod, url: str, **kwargs) -> Response:
        """Replay the oldest queued outcome; ``method``, ``url`` and ``kwargs`` are ignored.

        Raises:
            RuntimeError: If nothing is left in the queue.
            Exception: Whatever was registered through ``add_exception``.
        """
        if not self._queue:
            raise RuntimeError("MockClient: no more queued responses")

        kind, *payload = self._queue.pop(0)

        if kind == "exc":
            (error,) = payload
            raise error

        if kind == "invalid_json":
            status_code, text, headers = payload
            broken = _raw(status_code=status_code, text=text, headers=headers)
            # Make body parsing fail the way a non-JSON payload would.
            broken.json.side_effect = _json.JSONDecodeError("no json", "", 0)
            return Response(broken)

        # Any other tag is treated as a regular response entry.
        status_code, json_data, text, headers = payload
        regular = _raw(status_code=status_code, json_data=json_data, text=text, headers=headers)
        return Response(regular)

    async def aclose(self) -> None:
        """Nothing to release; present so the mock can be closed like a real client."""
        pass
Loading