Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/release-notes/eclair-vnext.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@

## Major changes

### Alternate strategy to avoid mass force-close of channels in certain cases

The default strategy, when an unhandled exception or internal error happens, is to locally force-close the channel. Not only is there a delay before the channel balance gets refunded, but if the exception was due to some misconfiguration or bug in eclair that affects all channels, we risk force-closing all channels.

This is why an alternative behavior is to simply log an error and stop the node. Note that if you don't closely monitor your node, there is a risk that your peers take advantage of the downtime to try and cheat by publishing a revoked commitment. Additionally, while there is no known way of triggering an internal error in eclair from the outside, there may very well be a bug that allows just that, which could be used as a way to remotely stop the node (with the default behavior, it would "only" cause a local force-close of the channel).

### Separate log for important notifications

Eclair added a new log file (`notifications.log`) for important notifications that require an action from the node operator.
Expand Down
10 changes: 10 additions & 0 deletions eclair-core/src/main/resources/reference.conf
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,16 @@ eclair {
max-block-processing-delay = 30 seconds // we add a random delay before processing blocks, capped at this value, to prevent herd effect
max-tx-publish-retry-delay = 60 seconds // we add a random delay before retrying failed transaction publication

// The default strategy, when we encounter an unhandled exception or internal error, is to locally force-close the
// channel. Not only is there a delay before the channel balance gets refunded, but if the exception was due to some
// misconfiguration or bug in eclair that affects all channels, we risk force-closing all channels.
// This is why an alternative behavior is to simply log an error and stop the node. Note that if you don't closely
// monitor your node, there is a risk that your peers take advantage of the downtime to try and cheat by publishing a
// revoked commitment. Additionally, while there is no known way of triggering an internal error in eclair from the
// outside, there may very well be a bug that allows just that, which could be used as a way to remotely stop the node
// (with the default behavior, it would "only" cause a local force-close of the channel).
unhandled-exception-strategy = "local-close" // local-close or stop

relay {
fees {
// Fees for public channels
Expand Down
8 changes: 8 additions & 0 deletions eclair-core/src/main/scala/fr/acinq/eclair/NodeParams.scala
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import fr.acinq.bitcoin.{Block, ByteVector32, Crypto, Satoshi}
import fr.acinq.eclair.Setup.Seeds
import fr.acinq.eclair.blockchain.fee._
import fr.acinq.eclair.channel.Channel
import fr.acinq.eclair.channel.Channel.UnhandledExceptionStrategy
import fr.acinq.eclair.crypto.Noise.KeyPair
import fr.acinq.eclair.crypto.keymanager.{ChannelKeyManager, NodeKeyManager}
import fr.acinq.eclair.db._
Expand Down Expand Up @@ -76,6 +77,7 @@ case class NodeParams(nodeKeyManager: NodeKeyManager,
relayParams: RelayParams,
reserveToFundingRatio: Double,
maxReserveToFundingRatio: Double,
unhandledExceptionStrategy: UnhandledExceptionStrategy,
db: Databases,
revocationTimeout: FiniteDuration,
autoReconnect: Boolean,
Expand Down Expand Up @@ -357,6 +359,11 @@ object NodeParams extends Logging {
PathFindingExperimentConf(experiments.toMap)
}

val unhandledExceptionStrategy = config.getString("unhandled-exception-strategy") match {
case "local-close" => UnhandledExceptionStrategy.LocalClose
case "stop" => UnhandledExceptionStrategy.Stop
}

val routerSyncEncodingType = config.getString("router.sync.encoding-type") match {
case "uncompressed" => EncodingType.UNCOMPRESSED
case "zlib" => EncodingType.COMPRESSED_ZLIB
Expand Down Expand Up @@ -423,6 +430,7 @@ object NodeParams extends Logging {
),
reserveToFundingRatio = config.getDouble("reserve-to-funding-ratio"),
maxReserveToFundingRatio = config.getDouble("max-reserve-to-funding-ratio"),
unhandledExceptionStrategy = unhandledExceptionStrategy,
db = database,
revocationTimeout = FiniteDuration(config.getDuration("revocation-timeout").getSeconds, TimeUnit.SECONDS),
autoReconnect = config.getBoolean("auto-reconnect"),
Expand Down
41 changes: 39 additions & 2 deletions eclair-core/src/main/scala/fr/acinq/eclair/channel/Channel.scala
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,17 @@ object Channel {
/** We don't immediately process [[CurrentBlockCount]] to avoid herd effects */
case class ProcessCurrentBlockCount(c: CurrentBlockCount)

// @formatter:off
/** What do we do if we have a local unhandled exception. */
sealed trait UnhandledExceptionStrategy
object UnhandledExceptionStrategy {
/** Ask our counterparty to close the channel. This may be the best choice for smaller loosely administered nodes.*/
case object LocalClose extends UnhandledExceptionStrategy
/** Just log an error and stop the node. May be better for larger nodes, to prevent unwanted mass force-close.*/
case object Stop extends UnhandledExceptionStrategy
}
// @formatter:on

}

class Channel(val nodeParams: NodeParams, val wallet: OnChainChannelFunder, remoteNodeId: PublicKey, blockchain: typed.ActorRef[ZmqWatcher.Command], relayer: ActorRef, txPublisherFactory: Channel.TxPublisherFactory, origin_opt: Option[ActorRef] = None)(implicit ec: ExecutionContext = ExecutionContext.Implicits.global) extends FSM[ChannelState, ChannelData] with FSMDiagnosticActorLogging[ChannelState, ChannelData] {
Expand Down Expand Up @@ -1669,6 +1680,7 @@ class Channel(val nodeParams: NodeParams, val wallet: OnChainChannelFunder, remo
case syncSuccess: SyncResult.Success =>
var sendQueue = Queue.empty[LightningMessage]
// normal case, our data is up-to-date

if (channelReestablish.nextLocalCommitmentNumber == 1 && d.commitments.localCommit.index == 0) {
// If next_local_commitment_number is 1 in both the channel_reestablish it sent and received, then the node MUST retransmit funding_locked, otherwise it MUST NOT
log.debug("re-sending fundingLocked")
Expand Down Expand Up @@ -2288,7 +2300,8 @@ class Channel(val nodeParams: NodeParams, val wallet: OnChainChannelFunder, remo

private def handleLocalError(cause: Throwable, d: ChannelData, msg: Option[Any]) = {
cause match {
case _: ForcedLocalCommit => log.warning(s"force-closing channel at user request")
case _: ForcedLocalCommit =>
log.warning(s"force-closing channel at user request")
case _ if msg.exists(_.isInstanceOf[OpenChannel]) || msg.exists(_.isInstanceOf[AcceptChannel]) =>
// invalid remote channel parameters are logged as warning
log.warning(s"${cause.getMessage} while processing msg=${msg.getOrElse("n/a").getClass.getSimpleName} in state=$stateName")
Expand All @@ -2308,7 +2321,31 @@ class Channel(val nodeParams: NodeParams, val wallet: OnChainChannelFunder, remo
log.info(s"we have a valid closing tx, publishing it instead of our commitment: closingTxId=${bestUnpublishedClosingTx.tx.txid}")
// if we were in the process of closing and already received a closing sig from the counterparty, it's always better to use that
handleMutualClose(bestUnpublishedClosingTx, Left(negotiating))
case dd: HasCommitments => spendLocalCurrent(dd) sending error // otherwise we use our current commitment
case dd: HasCommitments =>
cause match {
case _: ChannelException =>
// known channel exception: we force close using our current commitment
spendLocalCurrent(dd) sending error
case _ =>
// unhandled exception: we apply the configured strategy
nodeParams.unhandledExceptionStrategy match {
case UnhandledExceptionStrategy.LocalClose =>
spendLocalCurrent(dd) sending error
case UnhandledExceptionStrategy.Stop =>
log.error("unhandled exception: standard procedure would be to force-close the channel, but eclair has been configured to halt instead.")
NotificationsLogger.logFatalError(
s"""stopping node as configured strategy to unhandled exceptions for nodeId=$remoteNodeId channelId=${d.channelId}
|
|Eclair has been configured to shut down when an unhandled exception happens, instead of requesting a
|force-close from the peer. This gives the operator a chance of avoiding an unnecessary mass force-close
|of channels that may be caused by a bug in Eclair, or issues like running out of disk space, etc.
|
|You should get in touch with Eclair developers and provide logs of your node for analysis.
|""".stripMargin, cause)
System.exit(1)
stop(FSM.Shutdown)
}
}
case _ => goto(CLOSED) sending error // when there is no commitment yet, we just send an error to our peer and go to CLOSED state
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ object Helpers {
case class LocalLateUnproven(ourRemoteCommitmentNumber: Long, theirLocalCommitmentNumber: Long) extends Failure
case class RemoteLying(ourLocalCommitmentNumber: Long, theirRemoteCommitmentNumber: Long, invalidPerCommitmentSecret: PrivateKey) extends Failure
case object RemoteLate extends Failure
}
}
// @formatter:on

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import fr.acinq.bitcoin.{Block, ByteVector32, Satoshi, SatoshiLong, Script}
import fr.acinq.eclair.FeatureSupport.{Mandatory, Optional}
import fr.acinq.eclair.Features._
import fr.acinq.eclair.blockchain.fee._
import fr.acinq.eclair.channel.Channel.UnhandledExceptionStrategy
import fr.acinq.eclair.channel.LocalParams
import fr.acinq.eclair.crypto.keymanager.{LocalChannelKeyManager, LocalNodeKeyManager}
import fr.acinq.eclair.io.{Peer, PeerConnection}
Expand Down Expand Up @@ -143,6 +144,7 @@ object TestConstants {
feeProportionalMillionths = 30)),
reserveToFundingRatio = 0.01, // note: not used (overridden below)
maxReserveToFundingRatio = 0.05,
unhandledExceptionStrategy = UnhandledExceptionStrategy.LocalClose,
db = TestDatabases.inMemoryDb(),
revocationTimeout = 20 seconds,
autoReconnect = false,
Expand Down Expand Up @@ -269,6 +271,7 @@ object TestConstants {
feeProportionalMillionths = 30)),
reserveToFundingRatio = 0.01, // note: not used (overridden below)
maxReserveToFundingRatio = 0.05,
unhandledExceptionStrategy = UnhandledExceptionStrategy.LocalClose,
db = TestDatabases.inMemoryDb(),
revocationTimeout = 20 seconds,
autoReconnect = false,
Expand Down