16 Replies · Latest reply: Nov 10, 2011 11:03 AM by Sanne Grinovero

Replication Timeout with Hibernate Search - Infinispan

pbratton Newbie

I'm working with a 10-node Infinispan cluster used as a Hibernate Search backend.  Our servers are running TC Server 2.5 (Tomcat 6.0.32) on Java 1.6.0_24.  We are using jGroups 2.12.1.3 for handling cluster cache writes from each node, and for multicast UDP transport.

 

When we launch 3+ nodes in our cluster, one of the nodes eventually begins to log replication timeouts.  We've observed the same result whether we configure Infinispan for replication or for distribution cache mode.  Although the rest of the cluster remains stable, the failing node becomes essentially unusable for search.

 

Our configuration:

 

Infinispan:

<?xml version="1.0" encoding="UTF-8"?>

<infinispan

    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"

    xsi:schemaLocation="urn:infinispan:config:5.0 http://www.infinispan.org/schemas/infinispan-config-5.0.xsd"

    xmlns="urn:infinispan:config:5.0">

    <global>

        <globalJmxStatistics

            enabled="true"

            cacheManagerName="HibernateSearch"

            allowDuplicateDomains="true" />

        <transport

            clusterName="HibernateSearch-Infinispan-cluster-MT"

            distributedSyncTimeout="50000">

            <properties>

                <property name="configurationFile" value="infinispan-udp.cfg.xml" />

            </properties>

        </transport>

        <shutdown

            hookBehavior="DONT_REGISTER" />

    </global>

 

 

    <default>

        <locking

            lockAcquisitionTimeout="20000"

            writeSkewCheck="false"

            concurrencyLevel="5000"

            useLockStriping="false" />

        <storeAsBinary storeKeysAsBinary="false" storeValuesAsBinary="true"

            enabled="false" />

        <invocationBatching

            enabled="true" />

        <clustering

            mode="replication">

            <stateRetrieval

                timeout="60000"

                logFlushTimeout="65000"

                fetchInMemoryState="true"

                alwaysProvideInMemoryState="true" />

            <sync

                replTimeout="50000" />

            <l1 enabled="false" />

        </clustering>

        <jmxStatistics

            enabled="true" />

        <eviction

            maxEntries="-1"

            strategy="NONE" />

        <expiration

            maxIdle="-1" />

    </default>

 

 

    <namedCache

        name="LuceneIndexesMetadata">

        <clustering

            mode="replication">

            <stateRetrieval

                fetchInMemoryState="true"

                logFlushTimeout="30000" />

            <sync

                replTimeout="50000" />

            <l1 enabled="false" />

        </clustering>

        <locking

            lockAcquisitionTimeout="20000"

            writeSkewCheck="false"

            concurrencyLevel="5000"

            useLockStriping="false" />

        <loaders shared="true" preload="true">

            <loader class="org.infinispan.loaders.jdbm.JdbmCacheStore" fetchPersistentState="false" ignoreModifications="false" purgeOnStartup="false">

                <properties>

                    <property name="location" value="/usr/local/tc/.index/metadata" />

                </properties>

            </loader>

        </loaders>

    </namedCache>

    <namedCache

        name="LuceneIndexesData">

        <clustering

            mode="replication">

            <stateRetrieval

                fetchInMemoryState="true"

                logFlushTimeout="30000" />

            <sync

                replTimeout="50000" />

            <l1 enabled="false" />

        </clustering>

        <locking

            lockAcquisitionTimeout="20000"

            writeSkewCheck="false"

            concurrencyLevel="5000"

            useLockStriping="false" />

        <loaders shared="true" preload="true">

            <loader class="org.infinispan.loaders.jdbm.JdbmCacheStore" fetchPersistentState="false" ignoreModifications="false" purgeOnStartup="false">

                <properties>

                    <property name="location" value="/usr/local/tc/.index/data" />

                </properties>

            </loader>

        </loaders>

    </namedCache>

    <namedCache

        name="LuceneIndexesLocking">

        <clustering

            mode="replication">

            <stateRetrieval

                fetchInMemoryState="true"

                logFlushTimeout="30000" />

            <sync

                replTimeout="50000" />

            <l1 enabled="false" />

        </clustering>

        <locking

            lockAcquisitionTimeout="20000"

            writeSkewCheck="false"

            concurrencyLevel="5000"

            useLockStriping="false" />

    </namedCache>

</infinispan>


jGroups (UDP):

<config xmlns="urn:org:jgroups"

        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"

        xsi:schemaLocation="urn:org:jgroups http://www.jgroups.org/schema/JGroups-2.12.xsd">

    <UDP

         mcast_addr="${jgroups.udp.mcast_addr:228.10.10.9}"

         mcast_port="${jgroups.udp.mcast_port:45599}"

         tos="8"

         ucast_recv_buf_size="20000000"

         ucast_send_buf_size="640000"

         mcast_recv_buf_size="25000000"

         mcast_send_buf_size="640000"

         loopback="true"

         discard_incompatible_packets="true"

         max_bundle_size="64000"

         max_bundle_timeout="30"

         ip_ttl="${jgroups.udp.ip_ttl:2}"

         enable_bundling="true"

         enable_diagnostics="false"

         thread_naming_pattern="pl"

         thread_pool.enabled="true"

         thread_pool.min_threads="2"

         thread_pool.max_threads="30"

         thread_pool.keep_alive_time="5000"

         thread_pool.queue_enabled="false"

         thread_pool.queue_max_size="100"

         thread_pool.rejection_policy="Discard"

         oob_thread_pool.enabled="true"

         oob_thread_pool.min_threads="2"

         oob_thread_pool.max_threads="30"

         oob_thread_pool.keep_alive_time="5000"

         oob_thread_pool.queue_enabled="false"

         oob_thread_pool.queue_max_size="100"

         oob_thread_pool.rejection_policy="Discard"

         />

 

   <PING timeout="3000" num_initial_members="10"/>

   <MERGE2 max_interval="30000" min_interval="10000"/>

   <FD_SOCK/>

   <FD/>

   <BARRIER />

   <pbcast.NAKACK use_stats_for_retransmission="false"

                   exponential_backoff="0"

                   use_mcast_xmit="true" gc_lag="0"

                   retransmit_timeout="300,600,1200"

                   discard_delivered_msgs="true"/>

   <UNICAST timeout="300,600,1200"/>

   <pbcast.STABLE stability_delay="1000" desired_avg_gossip="50000" max_bytes="1000000"/>

   <pbcast.GMS print_local_addr="true" join_timeout="3000" view_bundling="true"/>

   <UFC max_credits="500000" min_threshold="0.20"/>

   <MFC max_credits="500000" min_threshold="0.20"/>

   <FRAG2 frag_size="60000"  />

   <pbcast.STREAMING_STATE_TRANSFER/>        

</config>

 

And the errors we observe:

10-31-2011 13:53:02 ERROR Hibernate Search: Directory writer-3 interceptors.InvocationContextInterceptor: ISPN000136: Execution error

org.infinispan.util.concurrent.TimeoutException: Replication timeout for tc-cluster-0105-21082

          at org.infinispan.remoting.transport.AbstractTransport.parseResponseAndAddToResponseList(AbstractTransport.java:71)

          at org.infinispan.remoting.transport.jgroups.JGroupsTransport.invokeRemotely(JGroupsTransport.java:452)

          at org.infinispan.remoting.rpc.RpcManagerImpl.invokeRemotely(RpcManagerImpl.java:132)

          at org.infinispan.remoting.rpc.RpcManagerImpl.invokeRemotely(RpcManagerImpl.java:156)

          at org.infinispan.remoting.rpc.RpcManagerImpl.invokeRemotely(RpcManagerImpl.java:265)

          at org.infinispan.remoting.rpc.RpcManagerImpl.invokeRemotely(RpcManagerImpl.java:252)

          at org.infinispan.remoting.rpc.RpcManagerImpl.broadcastRpcCommand(RpcManagerImpl.java:235)

          at org.infinispan.remoting.rpc.RpcManagerImpl.broadcastRpcCommand(RpcManagerImpl.java:228)

          at org.infinispan.interceptors.ReplicationInterceptor.handleCrudMethod(ReplicationInterceptor.java:116)

          at org.infinispan.interceptors.ReplicationInterceptor.visitPutKeyValueCommand(ReplicationInterceptor.java:79)

          at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)

          at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)

          at org.infinispan.interceptors.LockingInterceptor.visitPutKeyValueCommand(LockingInterceptor.java:294)

          at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)

          at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)

          at org.infinispan.interceptors.base.CommandInterceptor.handleDefault(CommandInterceptor.java:133)

          at org.infinispan.commands.AbstractVisitor.visitPutKeyValueCommand(AbstractVisitor.java:60)

          at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)

          at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)

          at org.infinispan.interceptors.TxInterceptor.enlistWriteAndInvokeNext(TxInterceptor.java:214)

          at org.infinispan.interceptors.TxInterceptor.visitPutKeyValueCommand(TxInterceptor.java:162)

          at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)

          at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)

          at org.infinispan.interceptors.CacheMgmtInterceptor.visitPutKeyValueCommand(CacheMgmtInterceptor.java:114)

          at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)

          at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)

          at org.infinispan.interceptors.InvocationContextInterceptor.handleAll(InvocationContextInterceptor.java:104)

          at org.infinispan.interceptors.InvocationContextInterceptor.handleDefault(InvocationContextInterceptor.java:64)

          at org.infinispan.commands.AbstractVisitor.visitPutKeyValueCommand(AbstractVisitor.java:60)

          at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)

          at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)

          at org.infinispan.interceptors.BatchingInterceptor.handleDefault(BatchingInterceptor.java:77)

          at org.infinispan.commands.AbstractVisitor.visitPutKeyValueCommand(AbstractVisitor.java:60)

          at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)

          at org.infinispan.interceptors.InterceptorChain.invoke(InterceptorChain.java:274)

          at org.infinispan.CacheImpl.putIfAbsent(CacheImpl.java:524)

          at org.infinispan.CacheSupport.putIfAbsent(CacheSupport.java:74)

          at org.infinispan.lucene.locking.BaseLuceneLock.obtain(BaseLuceneLock.java:65)

          at org.apache.lucene.store.Lock.obtain(Lock.java:72)

          at org.apache.lucene.index.IndexWriter.<init>(IndexWriter.java:1097)

          at org.hibernate.search.backend.Workspace.createNewIndexWriter(Workspace.java:202)

          at org.hibernate.search.backend.Workspace.getIndexWriter(Workspace.java:180)

          at org.hibernate.search.backend.impl.lucene.PerDPQueueProcessor.run(PerDPQueueProcessor.java:103)

          at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)

          at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)

          at java.lang.Thread.run(Thread.java:662)

 

Because this error is so pervasive regardless of our topology or caching mode, we believe we must be misconfigured somewhere.  Can anyone recommend a fix?

  • 1. Re: Replication Timeout with Hibernate Search - Infinispan
    Sanne Grinovero Master

    Hi,

    do you have an estimate of the index size, and what is your Hibernate Search configuration? I'm especially interested in the indexing tuning: you should try to make sure the index segments are not too big. The timeouts you have configured look quite generous, but they still need to be high enough for your network to replicate changes to the other nodes.
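    For example, in Hibernate Search 3.x, segment growth can be capped through the IndexWriter tuning properties; the values below are illustrative starting points rather than tested recommendations:

    ```xml
    <!-- Illustrative IndexWriter tuning for segment size; adjust to your workload -->
    <property name="hibernate.search.default.indexwriter.transaction.max_merge_docs">10000</property>
    <property name="hibernate.search.default.indexwriter.transaction.merge_factor">10</property>
    <property name="hibernate.search.default.indexwriter.transaction.ram_buffer_size">64</property>
    ```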

     

    Make sure JGroups can take full advantage of your network; are you using Linux? JGroups will log some suggested settings for the network configuration when it starts.

     

    It would be useful if you could monitor your system to identify whether you are waiting on some lock, or just being inefficient at the network level with a big index; please also try disabling the CacheLoader, as it might slow down the other systems.

  • 2. Re: Replication Timeout with Hibernate Search - Infinispan
    pbratton Newbie

    Hi Sanne,

     

    I can't tell you how much I appreciate the fast response.  Here is the relevant portion of our hibernate.cfg.xml:

     

    <?xml version="1.0" encoding="UTF-8"?>

    <!DOCTYPE hibernate-configuration PUBLIC

    "-//Hibernate/Hibernate Configuration DTD 3.0//EN"

    "http://www.hibernate.org/dtd/hibernate-configuration-3.0.dtd">

    <hibernate-configuration>

        <session-factory>

            <property name="hibernate.search.default.directory_provider">infinispan</property>

            <property name="hibernate.search.infinispan.configuration_resourcename">infinispan.cfg.xml</property>

            <property name="hibernate.search.infinispan.chunk_size">4096</property>

            <property name="hibernate.search.worker.execution">async</property>

            <property name="hibernate.search.worker.thread_pool.size">10</property>

     

     

            <property name="hibernate.search.worker.backend">jgroupsMaster</property>

            <property name="hibernate.search.worker.backend.jgroups.configurationFile">udp.cfg.xml</property>

            <property name="hibernate.search.worker.backend.jgroups.clusterName">Hibernate-Search-Cluster-MT</property>

        </session-factory>

    </hibernate-configuration>

     

    And the jGroups configuration for the Hibernate Search cluster:

    <config xmlns="urn:org:jgroups"

            xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"

            xsi:schemaLocation="urn:org:jgroups http://www.jgroups.org/schema/JGroups-2.12.xsd">

        <UDP

             mcast_addr="${jgroups.udp.mcast_addr:228.10.10.10}"

             mcast_port="${jgroups.udp.mcast_port:45588}"

             tos="8"

             ucast_recv_buf_size="20000000"

             ucast_send_buf_size="640000"

             mcast_recv_buf_size="25000000"

             mcast_send_buf_size="640000"

             loopback="true"

             discard_incompatible_packets="true"

             max_bundle_size="64000"

             max_bundle_timeout="30"

             ip_ttl="${jgroups.udp.ip_ttl:2}"

             enable_bundling="true"

             enable_diagnostics="true"

     

             thread_naming_pattern="pl"

     

     

             thread_pool.enabled="true"

             thread_pool.min_threads="2"

             thread_pool.max_threads="8"

             thread_pool.keep_alive_time="5000"

             thread_pool.queue_enabled="false"

             thread_pool.queue_max_size="100"

             thread_pool.rejection_policy="Run"

     

     

             oob_thread_pool.enabled="true"

             oob_thread_pool.min_threads="1"

             oob_thread_pool.max_threads="8"

             oob_thread_pool.keep_alive_time="5000"

             oob_thread_pool.queue_enabled="false"

             oob_thread_pool.queue_max_size="100"

             oob_thread_pool.rejection_policy="Run"/>

     

     

        <PING timeout="1000" num_initial_members="3"/>

        <MERGE2 max_interval="30000" min_interval="10000"/>

        <FD_SOCK/>

        <FD/>

        <VERIFY_SUSPECT timeout="1500"/>

        <pbcast.NAKACK use_stats_for_retransmission="false"

                       exponential_backoff="150"

                       use_mcast_xmit="true" gc_lag="0"

                       retransmit_timeout="300,600,1200"

                       discard_delivered_msgs="false"/>

        <UNICAST timeout="300,600,1200"/>

        <pbcast.STABLE stability_delay="1000" desired_avg_gossip="50000"

                       max_bytes="4m"/>  

        <pbcast.GMS print_local_addr="true" join_timeout="3000" view_bundling="true"/>

        <UFC max_credits="2M"

             min_threshold="0.4"/>

        <MFC max_credits="2M"

             min_threshold="0.4"/>

        <FRAG2 frag_size="60000"/>

        <pbcast.STREAMING_STATE_TRANSFER />

        <!-- <pbcast.STATE_TRANSFER/> -->

        <pbcast.FLUSH timeout="0"/>

    </config>

     

    I don't have a precise estimate of the index size; however, since we are starting a new system, the amount of data to be indexed is trivial at startup.  As for segment size, can you recommend a good way to find out?  We recently switched from a file-system based index (moving from that to a cluster), and the total index size was quite small.

     

    We are indeed on Linux (Red Hat EL 6.1).  We made some OS level changes based on the jGroups startup feedback, specifically setting the following:

    net.core.rmem_max=26214400

    net.core.wmem_max=640000

     

    in /etc/sysctl.conf.  One additional detail that may be of help is that the nodes are VMs in VMWare ESX, and there is a dedicated vswitch for the cluster traffic. 

     

    For monitoring, we've tried observing the jGroups and Infinispan logs for messages.  We don't immediately see anything that might indicate an issue.  Are there other places we can/should look?  Also, while disabling the CacheLoader may help, we would prefer to be able to bring the system up without a full re-index (in previous versions, indexing has taken 1+ hours with production data).  Can you recommend an alternative?

     

    Thanks again!

  • 3. Re: Replication Timeout with Hibernate Search - Infinispan
    Sanne Grinovero Master

    Any specific reason to have a different configuration than the one used by default in Infinispan? Infinispan includes a jgroups-udp.xml file in its main jar, which contains the configuration we use in our tests. It's of course often necessary to change the configuration, but it would be useful to understand why you made some changes, or whether you are using an outdated recommended configuration from an older Infinispan version.
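    For reference, the transport can point at the bundled defaults simply by referencing that file name (it is resolved from the classpath inside the infinispan-core jar); the transport attributes here are copied from your configuration above:

    ```xml
    <transport
        clusterName="HibernateSearch-Infinispan-cluster-MT"
        distributedSyncTimeout="50000">
        <properties>
            <!-- use the default JGroups config bundled in infinispan-core -->
            <property name="configurationFile" value="jgroups-udp.xml" />
        </properties>
    </transport>
    ```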

     

    The sysctl.conf settings look fine; just make sure by looking in /proc that they are applied. I don't know whether VMware affects the network; I'm not familiar with it.

     

    The Search configuration looks almost OK, but the chunk_size is very small. The default is deliberately small, as otherwise people get in trouble when they haven't set the sysctl.conf properties to accept larger packets, but in your case I'd suggest starting with around 10MB or slightly less: the bigger the chunks, the less index fragmentation, so bigger chunks give better overall performance until the packets get too big for your network and switches to handle efficiently.
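    Concretely, assuming chunk_size is expressed in bytes, a ~10MB setting in your hibernate.cfg.xml would look like:

    ```xml
    <!-- ~10MB chunks; illustrative starting point, tune for your network -->
    <property name="hibernate.search.infinispan.chunk_size">10485760</property>
    ```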

     

    What are the exact versions of Hibernate Search and Infinispan?

  • 4. Re: Replication Timeout with Hibernate Search - Infinispan
    pbratton Newbie

    We're using Hibernate Search 3.4.1.Final and Infinispan 5.0.0.Final.  We actually started with jgroups-udp.xml as a jumping-off point; I believe the only difference was that FD_ALL didn't work in our environment.  We also put the total size of the cluster into PING, which seemed to help with stability, although we eventually saw the same error (it just took longer).  Here's the complete diff against the original:

     

    0a1,22

    > <!--

    >   ~ JBoss, Home of Professional Open Source

    >   ~ Copyright 2010 Red Hat Inc. and/or its affiliates and other

    >   ~ contributors as indicated by the @author tags. All rights reserved.

    >   ~ See the copyright.txt in the distribution for a full listing of

    >   ~ individual contributors.

    >   ~

    >   ~ This is free software; you can redistribute it and/or modify it

    >   ~ under the terms of the GNU Lesser General Public License as

    >   ~ published by the Free Software Foundation; either version 2.1 of

    >   ~ the License, or (at your option) any later version.

    >   ~

    >   ~ This software is distributed in the hope that it will be useful,

    >   ~ but WITHOUT ANY WARRANTY; without even the implied warranty of

    >   ~ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

    >   ~ Lesser General Public License for more details.

    >   ~

    >   ~ You should have received a copy of the GNU Lesser General Public

    >   ~ License along with this software; if not, write to the Free

    >   ~ Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA

    >   ~ 02110-1301 USA, or see the FSF site: http://www.fsf.org.

    >   -->

    3,6c25,28

    <         xsi:schemaLocation="urn:org:jgroups http://www.jgroups.org/schema/JGroups-2.12.xsd">

    <     <UDP

    <          mcast_addr="${jgroups.udp.mcast_addr:228.10.10.9}"

    <          mcast_port="${jgroups.udp.mcast_port:45599}"

    ---

    >         xsi:schemaLocation="urn:org:jgroups file:schema/JGroups-2.8.xsd">

    >    <UDP

    >          mcast_addr="${jgroups.udp.mcast_addr:228.6.7.8}"

    >          mcast_port="${jgroups.udp.mcast_port:46655}"

    39c61

    <    <PING timeout="3000" num_initial_members="10"/>

    ---

    >    <PING timeout="3000" num_initial_members="3"/>

    42c64

    <    <FD/>

    ---

    >    <FD_ALL/>

    51c73

    <    <pbcast.GMS print_local_addr="true" join_timeout="3000" view_bundling="true"/>

    ---

    >    <pbcast.GMS print_local_addr="false" join_timeout="3000" view_bundling="true"/>

    55c77

    <    <pbcast.STREAMING_STATE_TRANSFER/>        

    ---

    >    <pbcast.STREAMING_STATE_TRANSFER/>

     

    It's not clear from the docs (or the source javadocs, for that matter) what units the chunk_size parameter takes.  I presume the units are bytes.  Upping the value to 10485760 had no effect.  We do think the issue is in the cluster Infinispan relies on, not the jGroups cluster used for the master-slave Hibernate Search updates.  We'll keep that change in place; perhaps it's an optimization we would have eventually needed anyway.

  • 5. Re: Replication Timeout with Hibernate Search - Infinispan
    pbratton Newbie

    One more thing... we verified that the /etc/sysctl.conf settings did get applied under /proc.

  • 6. Re: Replication Timeout with Hibernate Search - Infinispan
    pbratton Newbie

    Also, our Hibernate version is 3.6.4.Final.

  • 7. Re: Replication Timeout with Hibernate Search - Infinispan
    pbratton Newbie

    We downgraded Infinispan to 4.2.1.Final; that did the trick.  It's interesting that later versions of Infinispan are not pluggable with Hibernate Search 3.4.1; unfortunately HS 4 requires Hibernate 4 (for which we are awaiting Spring support before upgrading).

     

    Thank you again, Sanne, for your assistance.

  • 8. Re: Replication Timeout with Hibernate Search - Infinispan
    Sanne Grinovero Master

    Hi, I'm glad you solved it, but not happy that there is such a compatibility problem with Infinispan 5.

     

    I know Hibernate Search 3.4.x depends on Infinispan 4.2.x, but even then there are only minor differences. If you happen to have more information about this, that would be awesome, as I'd need to make sure Hibernate Search 4.x works fine with it.

  • 9. Re: Replication Timeout with Hibernate Search - Infinispan
    pbratton Newbie

    It could perhaps have had something to do with the default configurations available in both jars.  On our first attempt with Infinispan 4.2.1, we used the same jGroups configuration listed above.  We ran into more errors, so we tried the default configuration packaged with Infinispan 4.2.1, and the problem seems to have been resolved (for now).

     

    We believe the timeouts were due to a deadlock, but we're at a loss to figure out where the deadlock happened.

  • 10. Re: Replication Timeout with Hibernate Search - Infinispan
    Manik Surtani Master

    Can you confirm that even with pre-final versions of HS4 and Infinispan 5, you don't see this issue?

  • 11. Re: Replication Timeout with Hibernate Search - Infinispan
    pbratton Newbie

    I'm sorry, Hibernate Search 4 isn't an option for us for the reasons outlined above.  Upgrading my code to Hibernate 4 isn't something I have the bandwidth for at the moment.  Until Spring 3.1 is officially released, there are many users in our position who must rely on Hibernate Search 3.4.1.

  • 12. Re: Replication Timeout with Hibernate Search - Infinispan
    pbratton Newbie

    This may be helpful.  We had to patch jGroups 2.12.1.3.Final to address this error:

     

    Exception in thread "Hibernate Search: backend queueing processor-1" java.lang.NoSuchMethodError: org.jgroups.Message.<init>(Lorg/jgroups/Address;Lorg/jgroups/Address;Ljava/io/Serializable;)V

              at org.hibernate.search.backend.impl.jgroups.JGroupsBackendQueueProcessor.run(JGroupsBackendQueueProcessor.java:88)

              at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)

              at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)

              at java.lang.Thread.run(Thread.java:662

     

    The constructor signature had been changed to Message(Address, Address, Object), and for some reason the Serializable passed as a message by Hibernate Search was not accepted.  I verified in the jGroups code that a Serializable is expected (it will actually throw a RuntimeException if the Object is not castable to Serializable), so adding a constructor was trivial.

     

    What's bizarre here is that the error is non-deterministic.  Some of our writes go through, and some do not.
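    The shape of the patch is simple; here is a minimal sketch of the idea, using stand-in classes rather than the real JGroups source:

    ```java
    import java.io.Serializable;

    // Stand-in for org.jgroups.Address; its contents are irrelevant to the shim.
    class Address { }

    // Minimal stand-in for org.jgroups.Message illustrating the restored overload.
    class Message {
        private final Object payload;

        // The 2.12-style constructor takes a plain Object.
        Message(Address dest, Address src, Object obj) {
            this.payload = obj;
        }

        // Re-added 2.11-style overload: callers compiled against
        // <init>(Address, Address, Serializable) link against this exact
        // signature, so restoring it fixes the NoSuchMethodError.
        Message(Address dest, Address src, Serializable obj) {
            this(dest, src, (Object) obj);
        }

        Object getObject() {
            return payload;
        }
    }

    public class Shim {
        public static void main(String[] args) {
            Message m = new Message(null, null, (Serializable) "index-update");
            System.out.println(m.getObject()); // prints "index-update"
        }
    }
    ```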

  • 13. Re: Replication Timeout with Hibernate Search - Infinispan
    pbratton Newbie

    We made some progress in recent days.  It seems that Hibernate Search is the culprit here, specifically the jgroupsSlave backend worker.  When configuring a jgroupsSlave with a filesystem backend, HS expects an older version of jGroups than the one distributed with Infinispan (2.12.x).  It's the same problem these poor guys had.   The issue is that a constructor signature changed for org.jgroups.Message between 2.11.x and 2.12.x of jGroups.  So Infinispan would run fine, while HS would either a) error out if the worker was synchronous, or b) fail silently if the worker was asynchronous.

     

    The following jars will work together nicely:

    Hibernate Search 3.4.1.Final

    Infinispan 4.2.1.Final

    jGroups 2.11.1.Final

     

    The good news is that 2.11.1.Final does not seem to adversely affect Infinispan in any way.  We're still testing, and will provide updates as we go.

  • 14. Re: Replication Timeout with Hibernate Search - Infinispan
    Sanne Grinovero Master

    Right! Thank you very much for reporting that back, and for notifying the other thread as well.

     

    In Hibernate Search 4.x, since we don't need to support Java 5 anymore, the JGroups version is uniquely defined, so such issues won't happen again.

    I'll open an issue for Search, but I'm not sure how to solve it other than with reflection; splitting it into more modules is going to break the API, which is not something we want to do to fix an older version.
