Tuesday, January 3, 2017

[google-cloud-sql-discuss] Re: Unix Socket sometimes not present

import pipeline
import os
import random
import time
import uuid
import logging
import sqlalchemy
import MySQLdb
from sqlalchemy import orm
from sqlalchemy.exc import OperationalError
from sqlalchemy.ext import declarative
from sqlalchemy.pool import NullPool
from AppengineCommon.Utils import SqlHelper


# Module-level logger; named after the file so log lines identify this module.
LOG = logging.getLogger(__file__)
# Shared declarative base for the SQLAlchemy models defined below.
BASE = declarative.declarative_base()
# Fan-out widths: ProductA yields PRODUCT_A ProductB pipelines, and each
# ProductB yields PRODUCT_B SqlConnectionTester pipelines (10,000 leaf tasks).
PRODUCT_A = 100
PRODUCT_B = 100


# For clarity and ease of reproduction, I have moved most functions and classes into one file

'''
Example Run:
# Modify the connection parameters below to match your settings.
db = getSqlAlchemyDb()
createTables(db.engine) # First run only
trace = str(uuid.uuid4()) # Makes finding log messages easier
pipe = SqlConnectionProductA(trace)
pipe.start()
'''


#### Begin database functions

# FIXME: Add your own credentials
def getConnectionInfo():
    """Return (connectionParameters, connectionString) for the Cloud SQL database.

    FIXME: Add your own credentials -- the parameters come from an
    app-specific helper; substitute your own values when reproducing.

    Returns:
        tuple: (connectionParameters dict, connectionString template).
    """
    # The helper also returns a poolclass; it is intentionally unused here
    # because getEngine() always builds the engine with NullPool.
    parameters, connectionString, _poolclass = SqlHelper.getDatabaseConnectionInfo()

    # For clarity, this is the same connection string used to connect.
    connectionString = 'mysql+mysqldb://{user}@/{db}?unix_socket=/cloudsql/{dbInstanceName}'

    # The connectionParameters dict is expected to look like:
    # {
    #     "db": 'mydb',
    #     "user": 'myuser',
    #     "dbInstanceName": 'myinstance'
    # }
    return parameters, connectionString


def getSqlAlchemyDb():
    """Build a DbContainer using the app-specific connection settings."""
    # This fetches our appengine-specific information; create your own
    # equivalent when reproducing.
    params, connString = getConnectionInfo()
    return getDbContainer(params, connString)


def getMysqlDb():
    """Open a raw MySQLdb connection over the Cloud SQL unix socket.

    The "user" connection parameter is expected to be "user:password".
    Using partition() splits the credential once and yields an empty
    password (instead of an IndexError) when no ":" is present.

    Returns:
        A connected MySQLdb connection object.
    """
    connectionParameters, connectionString = getConnectionInfo()
    cloudsql_unix_socket = os.path.join('/cloudsql', connectionParameters["dbInstanceName"])

    # Split "user:password" once rather than calling split(":") twice.
    user, _, passwd = connectionParameters["user"].partition(":")

    db = MySQLdb.connect(
        unix_socket=cloudsql_unix_socket,
        user=user,
        passwd=passwd,
        db=connectionParameters["db"])

    return db


class DbContainer(object):
    """Pairs a SQLAlchemy engine with its currently active connection."""

    def __init__(self, engine, connection):
        self.engine = engine
        self.connection = connection

    def reconnect(self):
        """Replace the held connection with a fresh one from the engine.

        A failed attempt is logged and the previous connection is kept.
        """
        try:
            fresh = self.engine.connect()
        except OperationalError:
            LOG.exception("Failed to reconnect.")
        else:
            self.connection = fresh

class SessionTransaction(object):
    """Context manager yielding an ORM session bound to a DbContainer's engine.

    On exit the session is committed; an OperationalError during commit
    triggers a rollback instead. The session is then closed.
    """

    def __init__(self, db=None):
        # A container is mandatory -- fail fast rather than defer the error.
        if db is None:
            raise Exception("No sql session, cannot proceed")
        self.session = orm.sessionmaker(bind=db.engine)()

    def __enter__(self):
        return self.session

    def __exit__(self, exc_type, exc_value, exc_tb):
        try:
            self.session.commit()
        except OperationalError:
            self.session.rollback()

        self.session.close()


def getEngine(connectionParameters, connectionString):
    """Create a SQLAlchemy engine with connection pooling disabled (NullPool)."""
    url = connectionString.format(**connectionParameters)
    return sqlalchemy.create_engine(url, poolclass=NullPool)


def getDbContainer(connectionParameters, connectionString):
    """Build an engine, connect with retries, and return a DbContainer.

    Retries up to maxAttempts times on OperationalError, sleeping between
    attempts. The final failure is logged and re-raised immediately --
    without the pointless 4-second sleep the original took before raising.

    Raises:
        OperationalError: if every connection attempt fails.
    """
    engine = getEngine(connectionParameters, connectionString)
    maxAttempts = 3
    retryDelaySeconds = 4
    dbConnection = None
    for attempt in xrange(0, maxAttempts):
        try:
            dbConnection = engine.connect()
            break
        except OperationalError:
            # Give up right away on the last attempt; sleeping first only
            # delays the inevitable raise.
            if attempt == maxAttempts - 1:
                LOG.exception("Max reconnect attempts reached.")
                raise
            # Lazy %-style args: the message is only formatted if emitted.
            # warning() replaces the deprecated warn() alias.
            LOG.warning("Failed to connect parameters %s. Retrying... %s of %s",
                        connectionString.format(**connectionParameters),
                        attempt + 1, maxAttempts)
            time.sleep(retryDelaySeconds)

    return DbContainer(engine, dbConnection)


# A model representing a temporary table used in a metadata update query
# A model representing a temporary table used in a metadata update query.
class ThrashingTable(BASE):
    """Scratch table the test pipelines hammer with bulk inserts."""

    __tablename__ = 'ThrashingTable'

    # Surrogate primary key; other columns carry randomized test payloads.
    id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True)
    uuid = sqlalchemy.Column(sqlalchemy.String(150))
    marker1 = sqlalchemy.Column(sqlalchemy.Integer, nullable=False)
    marker2 = sqlalchemy.Column(sqlalchemy.Integer, nullable=False)
    transaction_id = sqlalchemy.Column(sqlalchemy.String(150))


def createTables(engine):
    """Create the ThrashingTable schema on the given engine (first run only)."""
    table = ThrashingTable.__table__
    table.create(bind=engine)


#### End database functions

#### Begin Pipeline classes
class SqlConnectionProductA(pipeline.Pipeline):
    """Top fan-out stage: spawns PRODUCT_A SqlConnectionProductB pipelines."""

    def run(self, trace):
        for _ in xrange(PRODUCT_A):
            yield SqlConnectionProductB(trace)


class SqlConnectionProductB(pipeline.Pipeline):
    """Second fan-out stage: spawns PRODUCT_B SqlConnectionTester pipelines."""

    def run(self, trace):
        for _ in xrange(PRODUCT_B):
            yield SqlConnectionTester(trace)


class SqlConnectionTester(pipeline.Pipeline):
    """Leaf pipeline: opens a database connection and performs a bulk insert."""

    def _sqlAlchemyMethod(self):
        # Insert a batch of rows through SQLAlchemy.
        key = str(uuid.uuid4())
        transaction_id = str(uuid.uuid4())
        # Build some data that can be inserted.
        rows = [
            ThrashingTable(
                uuid=key,
                marker1=random.randint(0, 25),
                marker2=random.randint(1481000000, 1481220141),
                transaction_id=transaction_id,
            )
            for _ in xrange(50)
        ]

        db = getSqlAlchemyDb()

        # This is a little contrived, but close to the original workload:
        # transactions that make the database do *some* real work are
        # enough to induce the connection problem.
        with SessionTransaction(db) as session:
            # Insert some rows into our thrash table.
            try:
                session.bulk_save_objects(rows)
            except OperationalError:
                LOG.exception("Failed to bulk save.")

    def _getValueString(self, uuid, marker1, marker2, transaction_id):
        # Package the values as a tuple suitable for cursor.executemany().
        return (uuid, marker1, marker2, transaction_id)

    def _mysqlDbMethod(self):
        # Same insert exercise, but through MySQLdb directly.
        key = str(uuid.uuid4())
        transaction_id = str(uuid.uuid4())
        rows = [
            self._getValueString(
                uuid=key,
                marker1=random.randint(0, 25),
                marker2=random.randint(1481000000, 1481220141),
                transaction_id=transaction_id,
            )
            for _ in xrange(50)
        ]

        db = getMysqlDb()
        cursor = db.cursor()

        insertSql = "INSERT INTO ThrashingTable (uuid, marker1, marker2, transaction_id) VALUES (%s, %s, %s, %s)"

        cursor.executemany(insertSql, rows)

    def run(self, trace):
        # Swap in self._mysqlDbMethod() to exercise the raw MySQLdb path;
        # only the SQLAlchemy path reproduces the error.
        self._sqlAlchemyMethod()

#### End pipeline classes

Hi Adam and Nick,

Thanks for the replies and sorry for my delayed response, I've been on a bit of vacation.

Nick, I agree with your findings, I've attached a slightly updated example file (simpler queries) that has the ability to run with MySQLdb directly and with SqlAlchemy and the error doesn't exhibit when running MySQLdb directly, but it does while running SqlAlchemy, even with the NullPool ConnectionPool.

However, as Adam correctly points out, the error message is no longer "No such file or directory", but now "Transport endpoint is not connected". I have an example that I've been running since the start, previously giving the stacktrace I originally posted, that is now giving this "Transport endpoint is not connected" instead. I haven't changed anything on my end, so I have to assume in the weeks that I've been working on this problem something has changed on the google end.

Adam, to your statement that "all available concurrent connections" are saturated. My reading of the documentation stated I should be able to have about 4000 concurrent connections (https://cloud.google.com/sql/faq#sizeqps). When limiting my "max_concurrent_requests" to 72 I am seeing the attached error rates across 18 instances (since my instances are limited to a max of 6 simultaneous requests). It seems like this should be well below the 4000 connection limit, by a couple of orders of magnitude, and yet I am still seeing this error occur. You also mentioned getting 5k tasks in the queue; if the queues are rate limited/concurrent request limited, the size of the queue backlog shouldn't have any effect on concurrent sql connections, correct? Or am I misunderstanding something here?

Also, just to clarify, I don't believe there to be a "bad" instance floating around out there. Initially, I did not see instances "recover" with subsequent requests, but a slower rate (of about 6/m) proved that bit of information to be incorrect. You can, however, see from my attached screenshot that instances vary in their error rate.

I really appreciate the both of you getting back to me and Nick for sticking with this for so long. Unfortunately, work priorities have come up and since I have a work around with the MySQLdb route, I'm going to just take SqlAlchemy out of the mix and move forward with the working solution. If there are other contributions I can make in my spare time, let me know and I'll happily help. Otherwise, this is me "signing off".

Cheers,

Myles

On Monday, January 2, 2017 at 3:29:19 PM UTC-7, Adam (Cloud Platform Support) wrote:
I've tried reproducing this myself using your code and haven't seen the 'No such file or directory' error either. The only errors I see are (normal) errors about 'Deadlock found when trying to get lock' due to all the concurrently executing transactions and 'Transport endpoint is not connected' once I reach about 5k tasks and 10+ instances saturating all available concurrent connections.

Perhaps I'll simply need to wait until I get a 'bad' instance, but so far it hasn't occurred.

On Wednesday, December 21, 2016 at 7:42:16 PM UTC-5, paynen wrote:
Hey Myles,

Other than the hint on connection pooling, I've been unable to reproduce the issues you've seen, despite using a pooled connection as well. Is this still occurring despite switching to a non-pooled connection model? 

Regards,

Nick
Cloud Platform Community Support


On Monday, December 19, 2016 at 4:33:04 PM UTC-5, paynen wrote:
Hey Myles,

You've done an extraordinary job in cataloging the information needed to look into this further. Apologies that I've not got anything definitive to relate, as I'm in the process of attempting to replicate this behaviour. It appears to be related to connection pooling, although I'm not sure exactly how. I hope to update this thread within the next 2 days with more information.

Regards,

Nick
Cloud Platform Community Support

On Friday, December 9, 2016 at 3:20:17 PM UTC-5, Myles Bostwick wrote:
Hi Nick,

I've been continuing to further isolate the problem by reducing the rate at which I process data. The error appears to fluctuate on an instance when at a slower rate (in this case 6/m). I've attached some screenshots of my log, on one instance. The info messages are successful calls, the warnings indicate failure in at least one connection attempt, and the criticals indicate failure in all three attempts to connect.  I am still seeing a pretty high failure rate of around 30-40 percent vs about 50% at a rate of 10/s. At this point I don't have a theory as to what is happening; 6/m is about 2 orders of magnitude slower than our ideal rate of 10/s.

I'm going to try 2 more tests

1. Change MySQLdb to version 1.2.4 and 1.2.5 (presently at "latest" which is 1.2.4b4 apparently)
2. Try these iterations on CloudSQL First Generation now that the inefficient SQL is no longer present.

Thanks again for looking, I hope we can get to a solution on this.

Cheers,

Myles

On Thursday, December 8, 2016 at 2:25:58 PM UTC-7, Myles Bostwick wrote:
Hi Nick,

I appreciate you getting back to me.

I've attached an example that produces the behavior, through testing I've come to better understand a little what's going on. Once reaching a certain rate, while processing mysql is under load, the error is returned. Originally I created an example that just sent "SELECT 1" to mysql and could not induce the error, so MySQL has to be under some load.

The example I've attached induces the error, though there are two classes of errors:

1. The "No such file" error I've originally reported
2. The understandable Deadlock error from mysql

I'm not concerned about #2 as that's just a SQL optimization I've already taken care of, but I still receive the "No such file" error in my production code without a single deadlock error.

"The patterns and frequences of connections on your instances"

I have a taskqueue that is setup to process all my sql connections, so that 1, requests aren't hampered by SQL operations and 2, we can rate limit the interactions with MySQL. My autoscale settings are set to restrict to 6 connections, due to the 12 connection limit, I wanted to give it some room.

"The way in which you've determined it's isolated to a given instance"

I don't think I was clear when I described this, once an instance starts exhibiting this symptom, subsequent requests to the instance all exhibit the symptom. I determined this just by checking the instance id in the log messages and through restricting to a given instance id.

Hopefully my attached example will enable reproduction. It's a little messy because I consolidated several files into one file for ease of transport, but it should be fairly straightforward.

Let me know if you need me to file this away to the issue tracker you mentioned, I'm happy to do so.

Cheers,

Myles

On Tuesday, December 6, 2016 at 1:44:54 PM UTC-7, paynen wrote:
Hey Myles,

An issue like this would be best reported in the Public Issue Tracker for Cloud SQL.

Nonetheless, we can continue to work on it here until we can determine more accurately what should go in the report. There are some more pieces of information that could be relevant here:

* The patterns and frequencies of connections on your instances 

* The way in which you've determined it's isolated to a given instance

* A description of the pipeline task you're performing so we can attempt to reproduce the issue that way

Feel free to add any more information you think could help in reproducing this issue.

Regards,

Nick
Cloud Platform Community Support

On Monday, December 5, 2016 at 1:02:56 PM UTC-5, Myles Bostwick wrote:
Hello All,

I've been playing around with the pipelines library (https://github.com/GoogleCloudPlatform/appengine-pipelines) in my standard appengine environment and managed to get enough parallel instances running to cause problems. Sometimes, persistent to an instance it seems, I don't have a unix socket (as seen in the below stacktrace). Things I have tried to mitigate this:

  1. On failed connect, try 3 times with 4 seconds between attempts (since I have a 60 second max request time, this seems HUGE)
  2. Reducing concurrent requests per instance to 8
  3. Reducing max concurrent requests to 20
  4. Upgrading from First Generation to Second Generation Cloud SQL instances
  5. Restricting to a single instance (which does "solve" the problem, but doesn't meet my throughput goals)
I've consistently been able to reproduce this issue, in its inconsistent nature. My final recourse is going to be to wait the 3 times, then connect via IP instead. Any thoughts or suggestions in addition to this would be greatly appreciated.

Thanks in advance,

Myles


...
  File "/base/data/home/apps/s~hydrovu-dev/api:2-0-0.397476056640579045/lib/sqlalchemy/engine/base.py", line 1778, in connect return self._connection_cls(self, **kwargs) File "/base/data/home/apps/s~hydrovu-dev/api:2-0-0.397476056640579045/lib/sqlalchemy/engine/base.py", line 60, in __init__ self.__connection = connection or engine.raw_connection() File "/base/data/home/apps/s~hydrovu-dev/api:2-0-0.397476056640579045/lib/sqlalchemy/engine/base.py", line 1847, in raw_connection return self.pool.unique_connection() File "/base/data/home/apps/s~hydrovu-dev/api:2-0-0.397476056640579045/lib/sqlalchemy/pool.py", line 280, in unique_connection return _ConnectionFairy._checkout(self) File "/base/data/home/apps/s~hydrovu-dev/api:2-0-0.397476056640579045/lib/sqlalchemy/pool.py", line 644, in _checkout fairy = _ConnectionRecord.checkout(pool) File "/base/data/home/apps/s~hydrovu-dev/api:2-0-0.397476056640579045/lib/sqlalchemy/pool.py", line 440, in checkout rec = pool._do_get() File "/base/data/home/apps/s~hydrovu-dev/api:2-0-0.397476056640579045/lib/sqlalchemy/pool.py", line 1057, in _do_get return self._create_connection() File "/base/data/home/apps/s~hydrovu-dev/api:2-0-0.397476056640579045/lib/sqlalchemy/pool.py", line 285, in _create_connection return _ConnectionRecord(self) File "/base/data/home/apps/s~hydrovu-dev/api:2-0-0.397476056640579045/lib/sqlalchemy/pool.py", line 411, in __init__ self.connection = self.__connect() File "/base/data/home/apps/s~hydrovu-dev/api:2-0-0.397476056640579045/lib/sqlalchemy/pool.py", line 538, in __connect connection = self.__pool._creator() File "/base/data/home/apps/s~hydrovu-dev/api:2-0-0.397476056640579045/lib/sqlalchemy/engine/strategies.py", line 96, in connect connection_invalidated=invalidated File "/base/data/home/apps/s~hydrovu-dev/api:2-0-0.397476056640579045/lib/sqlalchemy/util/compat.py", line 199, in raise_from_cause reraise(type(exception), exception, tb=exc_tb) File 
"/base/data/home/apps/s~hydrovu-dev/api:2-0-0.397476056640579045/lib/sqlalchemy/engine/strategies.py", line 90, in connect return dialect.connect(*cargs, **cparams) File "/base/data/home/apps/s~hydrovu-dev/api:2-0-0.397476056640579045/lib/sqlalchemy/engine/default.py", line 377, in connect return self.dbapi.connect(*cargs, **cparams) File "/base/data/home/runtimes/python27/python27_lib/versions/third_party/MySQLdb-1.2.4b4/MySQLdb/__init__.py", line 81, in Connect return Connection(*args, **kwargs) File "/base/data/home/runtimes/python27/python27_lib/versions/third_party/MySQLdb-1.2.4b4/MySQLdb/connections.py", line 190, in __init__ super(Connection, self).__init__(*args, **kwargs2) OperationalError: (OperationalError) (2062, 'Cloud SQL socket open failed with error: No such file or directory') None None

--
You received this message because you are subscribed to the Google Groups "Google Cloud SQL discuss" group.
To unsubscribe from this group and stop receiving emails from it, send an email to google-cloud-sql-discuss+unsubscribe@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/google-cloud-sql-discuss/dc557cc9-0a00-4b02-adde-9df9976b87cd%40googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

No comments:

Post a Comment