参考文章:使用python操作kafka - 三度 - 博客园
python操作kafka实践 - Small_office - 博客园
python kafka生产者 进程间通信和使用线程池 - **小君哥** - 博客园
安装kafka-python
pip3 install kafka-python
生产者
producer_test.py
from kafka import KafkaProducer
from kafka.errors import KafkaError


class MyKafkaProducer:
    """Thin holder for a KafkaProducer connected to the demo broker."""

    def __init__(self):
        # Connect to the Kafka broker.
        self.producer = KafkaProducer(bootstrap_servers='192.168.0.121:9092')


if __name__ == '__main__':
    kafka_producer = MyKafkaProducer()
    producer = kafka_producer.producer
    msg = "Hello World".encode('utf-8')  # payload must be bytes
    try:
        # send() is asynchronous; block on the returned future so a delivery
        # failure surfaces here instead of being silently dropped.
        future = producer.send('test', msg)  # publish to topic 'test'
        record = future.get(timeout=10)
        print("delivered to %s:%d at offset %d"
              % (record.topic, record.partition, record.offset))
    except KafkaError as err:
        print("delivery failed: %s" % err)
    finally:
        producer.flush()  # drain any buffered records before shutting down
        producer.close()
消费者
import time
from kafka import KafkaConsumer, TopicPartition


class MyKafkaConsumer:
    """Holds example KafkaConsumer instances (default group and explicit group)."""

    def __init__(self):
        # Consumer in the default consumer group, subscribed to topic 'test'.
        self.consumer = KafkaConsumer('test', bootstrap_servers=['192.168.0.121:9092'])
        # Consumer bound to an explicit consumer group ('123456').
        self.consumer2 = KafkaConsumer('test_rhj', group_id='123456',
                                       bootstrap_servers=['10.43.35.25:4531'])


def consume_forever(consumer):
    """Block forever, printing every message received on the consumer."""
    for msg in consumer:
        recv = "%s:%d:%d: key=%s value=%s" % (msg.topic, msg.partition,
                                              msg.offset, msg.key, msg.value)
        print(recv)


def consume_assigned_partitions():
    """Manually assign partitions 0 and 1 of 'test_rhj' and replay from offset 0.

    NOTE: assign() must be called on a consumer that was NOT constructed with a
    topic argument (i.e. not already subscribed); kafka-python raises
    IllegalStateError otherwise.  The original example called assign() on an
    already-subscribed consumer, which cannot work.
    """
    consumer = KafkaConsumer(bootstrap_servers=['192.168.0.121:9092'])
    consumer.assign([TopicPartition(topic='test_rhj', partition=0),
                     TopicPartition(topic='test_rhj', partition=1)])
    print(consumer.partitions_for_topic("test_rhj"))  # partition ids of the topic
    print(consumer.assignment())                      # the partitions just assigned
    print(consumer.beginning_offsets(consumer.assignment()))
    # Seek partition 0 back to offset 0 so every message ever produced is replayed.
    consumer.seek(TopicPartition(topic='test_rhj', partition=0), 0)
    consume_forever(consumer)


def poll_in_batches(consumer):
    """Pull messages in batches with poll() instead of blocking iteration.

    Useful when per-message handling would be a bottleneck: fetch whatever is
    queued, process it as a batch, then sleep.
    """
    consumer.subscribe(topics=('test_rhj',))
    index = 0
    while True:
        msg = consumer.poll(timeout_ms=5)  # fetch available records, wait <= 5 ms
        print(msg)
        time.sleep(2)
        index += 1
        print('--------poll index is %s----------' % index)


if __name__ == '__main__':
    kafka_consumer = MyKafkaConsumer()
    # Blocks forever printing messages from topic 'test'.  In the original
    # script the assign/poll demos sat unreachable after this infinite loop;
    # run them instead by calling consume_assigned_partitions() or
    # poll_in_batches(kafka_consumer.consumer2) here.
    consume_forever(kafka_consumer.consumer)
执行此程序,此时会hold住,因为它在等待生产者发送消息!
再次执行生产者,此时会输出:
test:0:9: key=None value=b'Hello World'

线程池消费
from kafka import KafkaConsumer
import time, threading
from concurrent.futures import ThreadPoolExecutor, as_completed


class KafkaConsumerThreadPool(object):
    """Consume one topic with 8 threads, each owning its own KafkaConsumer.

    KafkaConsumer is not thread-safe, so every worker builds a private
    consumer; all workers join the same consumer group, letting Kafka
    balance the topic's partitions across the threads.
    """

    WORKERS = 8  # one consumer per worker thread

    def __init__(self):
        self.threadPool = ThreadPoolExecutor(max_workers=self.WORKERS,
                                             thread_name_prefix="threading_")
        self.hosts = ["ip:port", "ip:port", "ip:port"]

    def operate(self):
        """Worker body: create a consumer and print every message it receives."""
        consumer = KafkaConsumer("topic_name", bootstrap_servers=self.hosts,
                                 group_id="group_id_name")
        # Each worker prints its own distinct thread name.
        print(threading.current_thread().name)
        for record in consumer:
            print(record)
            time.sleep(1)

    def main(self):
        """Fan the consumer workers out onto the pool and surface their errors.

        The original discarded the futures returned by submit(), so any
        exception raised inside a worker was silently swallowed; keeping the
        futures and calling result() re-raises worker failures here.
        """
        futures = [self.threadPool.submit(self.operate) for _ in range(self.WORKERS)]
        for future in as_completed(futures):
            future.result()  # propagate any worker exception


if __name__ == '__main__':
    cla = KafkaConsumerThreadPool()
    cla.main()



